diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/__pycache__/observation_function.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/__pycache__/observation_function.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d3e28fd77a6d874c21c8deb9faf988e8e0eb53db Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/__pycache__/observation_function.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/__pycache__/sample_batch_builder.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/__pycache__/sample_batch_builder.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1ada321da8979f0a32c555dc1c986b5ad3a5da21 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/__pycache__/sample_batch_builder.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/__pycache__/worker_set.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/__pycache__/worker_set.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7791fc17feb7f573f5f82c2a9288b420b27fe891 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/__pycache__/worker_set.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..327801a0ffdc19da86cc7e6c3f90170694c09118 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/__pycache__/agent_collector.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/__pycache__/agent_collector.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4b22db4ec1586e5ac1f371538235c6d7b24462b3 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/__pycache__/agent_collector.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/__pycache__/sample_collector.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/__pycache__/sample_collector.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..41ca589ef40746112ca7cd6f0612dab60b48720a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/__pycache__/sample_collector.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/__pycache__/simple_list_collector.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/__pycache__/simple_list_collector.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1301013fd4111457d9606dcb95081296346cb0a4 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/__pycache__/simple_list_collector.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/agent_collector.py b/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/agent_collector.py new file mode 100644 index 
0000000000000000000000000000000000000000..0628cbcb9718cbae1e4539ca07f0f2f7b25a0989 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/agent_collector.py @@ -0,0 +1,688 @@ +import copy +import logging +import math +from typing import Any, Dict, List, Optional + +import numpy as np +import tree # pip install dm_tree +from gymnasium.spaces import Space + +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.policy.view_requirement import ViewRequirement +from ray.rllib.utils.annotations import OldAPIStack +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.spaces.space_utils import ( + flatten_to_single_ndarray, + get_dummy_batch_for_space, +) +from ray.rllib.utils.typing import ( + EpisodeID, + EnvID, + TensorType, + ViewRequirementsDict, +) + +logger = logging.getLogger(__name__) + +torch, _ = try_import_torch() + + +def _to_float_np_array(v: List[Any]) -> np.ndarray: + if torch and torch.is_tensor(v[0]): + raise ValueError + arr = np.array(v) + if arr.dtype == np.float64: + return arr.astype(np.float32) # save some memory + return arr + + +def _get_buffered_slice_with_paddings(d, inds): + element_at_t = [] + for index in inds: + if index < len(d): + element_at_t.append(d[index]) + else: + # zero pad similar to the last element. + element_at_t.append(tree.map_structure(np.zeros_like, d[-1])) + return element_at_t + + +@OldAPIStack +class AgentCollector: + """Collects samples for one agent in one trajectory (episode). + + The agent may be part of a multi-agent environment. Samples are stored in + lists including some possible automatic "shift" buffer at the beginning to + be able to save memory when storing things like NEXT_OBS, PREV_REWARDS, + etc.., which are specified using the trajectory view API. + """ + + _next_unroll_id = 0 # disambiguates unrolls within a single episode + + # TODO: @kourosh add different types of padding. e.g. zeros vs. 
same + def __init__( + self, + view_reqs: ViewRequirementsDict, + *, + max_seq_len: int = 1, + disable_action_flattening: bool = True, + intial_states: Optional[List[TensorType]] = None, + is_policy_recurrent: bool = False, + is_training: bool = True, + _enable_new_api_stack: bool = False, + ): + """Initialize an AgentCollector. + + Args: + view_reqs: A dict of view requirements for the agent. + max_seq_len: The maximum sequence length to store. + disable_action_flattening: If True, don't flatten the action. + intial_states: The initial states from the policy.get_initial_states() + is_policy_recurrent: If True, the policy is recurrent. + is_training: Sets the is_training flag for the buffers. if True, all the + timesteps are stored in the buffers until explictly build_for_training + () is called. if False, only the content required for the last time + step is stored in the buffers. This will save memory during inference. + You can change the behavior at runtime by calling is_training(mode). + """ + self.max_seq_len = max_seq_len + self.disable_action_flattening = disable_action_flattening + self.view_requirements = view_reqs + # The initial_states can be an np array + self.initial_states = intial_states if intial_states is not None else [] + self.is_policy_recurrent = is_policy_recurrent + self._is_training = is_training + self._enable_new_api_stack = _enable_new_api_stack + + # Determine the size of the buffer we need for data before the actual + # episode starts. This is used for 0-buffering of e.g. prev-actions, + # or internal state inputs. + view_req_shifts = [ + min(vr.shift_arr) + - int((vr.data_col or k) in [SampleBatch.OBS, SampleBatch.INFOS]) + for k, vr in view_reqs.items() + ] + self.shift_before = -min(view_req_shifts) + + # The actual data buffers. Keys are column names, values are lists + # that contain the sub-components (e.g. for complex obs spaces) with + # each sub-component holding a list of per-timestep tensors. 
+ # E.g.: obs-space = Dict(a=Discrete(2), b=Box((2,))) + # buffers["obs"] = [ + # [0, 1], # <- 1st sub-component of observation + # [np.array([.2, .3]), np.array([.0, -.2])] # <- 2nd sub-component + # ] + # NOTE: infos and state_out... are not flattened due to them often + # using custom dict values whose structure may vary from timestep to + # timestep. + self.buffers: Dict[str, List[List[TensorType]]] = {} + # Maps column names to an example data item, which may be deeply + # nested. These are used such that we'll know how to unflatten + # the flattened data inside self.buffers when building the + # SampleBatch. + self.buffer_structs: Dict[str, Any] = {} + # The episode ID for the agent for which we collect data. + self.episode_id = None + # The unroll ID, unique across all rollouts (within a RolloutWorker). + self.unroll_id = None + # The simple timestep count for this agent. Gets increased by one + # each time a (non-initial!) observation is added. + self.agent_steps = 0 + # Keep track of view requirements that have a view on columns that we gain from + # inference and also need for inference. These have dummy values appended in + # buffers to account for the missing value when building for inference + # Example: We have one 'state_in' view requirement that has a view on our + # state_outs at t=[-10, ..., -1]. At any given build_for_inference()-call, + # the buffer must contain eleven values from t=[-10, ..., 0] for us to index + # properly. Since state_out at t=0 is missing, we substitute it with a buffer + # value that should never make it into batches built for training. 
+ self.data_cols_with_dummy_values = set() + + @property + def training(self) -> bool: + return self._is_training + + def is_training(self, is_training: bool) -> None: + self._is_training = is_training + + def is_empty(self) -> bool: + """Returns True if this collector has no data.""" + return not self.buffers or all(len(item) == 0 for item in self.buffers.values()) + + def add_init_obs( + self, + episode_id: EpisodeID, + agent_index: int, + env_id: EnvID, + init_obs: TensorType, + init_infos: Optional[Dict[str, TensorType]] = None, + t: int = -1, + ) -> None: + """Adds an initial observation (after reset) to the Agent's trajectory. + + Args: + episode_id: Unique ID for the episode we are adding the + initial observation for. + agent_index: Unique int index (starting from 0) for the agent + within its episode. Not to be confused with AGENT_ID (Any). + env_id: The environment index (in a vectorized setup). + init_obs: The initial observation tensor (after `env.reset()`). + init_infos: The initial infos dict (after `env.reset()`). + t: The time step (episode length - 1). The initial obs has + ts=-1(!), then an action/reward/next-obs at t=0, etc.. + """ + # Store episode ID + unroll ID, which will be constant throughout this + # AgentCollector's lifecycle. + self.episode_id = episode_id + if self.unroll_id is None: + self.unroll_id = AgentCollector._next_unroll_id + AgentCollector._next_unroll_id += 1 + + # convert init_obs to np.array (in case it is a list) + if isinstance(init_obs, list): + init_obs = np.array(init_obs) + + if SampleBatch.OBS not in self.buffers: + single_row = { + SampleBatch.OBS: init_obs, + SampleBatch.INFOS: init_infos or {}, + SampleBatch.AGENT_INDEX: agent_index, + SampleBatch.ENV_ID: env_id, + SampleBatch.T: t, + SampleBatch.EPS_ID: self.episode_id, + SampleBatch.UNROLL_ID: self.unroll_id, + } + + # TODO (Artur): Remove when PREV_ACTIONS and PREV_REWARDS get deprecated. 
+ # Note (Artur): As long as we have these in our default view requirements, + # we should build buffers with neutral elements instead of building them + # on the first AgentCollector.build_for_inference call if present. + # This prevents us from accidentally building buffers with duplicates of + # the first incoming value. + if SampleBatch.PREV_REWARDS in self.view_requirements: + single_row[SampleBatch.REWARDS] = get_dummy_batch_for_space( + space=self.view_requirements[SampleBatch.REWARDS].space, + batch_size=0, + fill_value=0.0, + ) + if SampleBatch.PREV_ACTIONS in self.view_requirements: + potentially_flattened_batch = get_dummy_batch_for_space( + space=self.view_requirements[SampleBatch.ACTIONS].space, + batch_size=0, + fill_value=0.0, + ) + if not self.disable_action_flattening: + potentially_flattened_batch = flatten_to_single_ndarray( + potentially_flattened_batch + ) + single_row[SampleBatch.ACTIONS] = potentially_flattened_batch + self._build_buffers(single_row) + + # Append data to existing buffers. + flattened = tree.flatten(init_obs) + for i, sub_obs in enumerate(flattened): + self.buffers[SampleBatch.OBS][i].append(sub_obs) + self.buffers[SampleBatch.INFOS][0].append(init_infos or {}) + self.buffers[SampleBatch.AGENT_INDEX][0].append(agent_index) + self.buffers[SampleBatch.ENV_ID][0].append(env_id) + self.buffers[SampleBatch.T][0].append(t) + self.buffers[SampleBatch.EPS_ID][0].append(self.episode_id) + self.buffers[SampleBatch.UNROLL_ID][0].append(self.unroll_id) + + def add_action_reward_next_obs(self, input_values: Dict[str, TensorType]) -> None: + """Adds the given dictionary (row) of values to the Agent's trajectory. + + Args: + values: Data dict (interpreted as a single row) to be added to buffer. + Must contain keys: + SampleBatch.ACTIONS, REWARDS, TERMINATEDS, TRUNCATEDS, and NEXT_OBS. + """ + if self.unroll_id is None: + self.unroll_id = AgentCollector._next_unroll_id + AgentCollector._next_unroll_id += 1 + + # Next obs -> obs. 
+ values = copy.copy(input_values) + assert SampleBatch.OBS not in values + values[SampleBatch.OBS] = values[SampleBatch.NEXT_OBS] + del values[SampleBatch.NEXT_OBS] + + # convert obs to np.array (in case it is a list) + if isinstance(values[SampleBatch.OBS], list): + values[SampleBatch.OBS] = np.array(values[SampleBatch.OBS]) + + # Default to next timestep if not provided in input values + if SampleBatch.T not in input_values: + values[SampleBatch.T] = self.buffers[SampleBatch.T][0][-1] + 1 + + # Make sure EPS_ID/UNROLL_ID stay the same for this agent. + if SampleBatch.EPS_ID in values: + assert values[SampleBatch.EPS_ID] == self.episode_id + del values[SampleBatch.EPS_ID] + self.buffers[SampleBatch.EPS_ID][0].append(self.episode_id) + if SampleBatch.UNROLL_ID in values: + assert values[SampleBatch.UNROLL_ID] == self.unroll_id + del values[SampleBatch.UNROLL_ID] + self.buffers[SampleBatch.UNROLL_ID][0].append(self.unroll_id) + + for k, v in values.items(): + if k not in self.buffers: + if self.training and k.startswith("state_out"): + vr = self.view_requirements[k] + data_col = vr.data_col or k + self._fill_buffer_with_initial_values( + data_col, vr, build_for_inference=False + ) + else: + self._build_buffers({k: v}) + # Do not flatten infos, state_out and (if configured) actions. + # Infos/state-outs may be structs that change from timestep to + # timestep. 
+ should_flatten_action_key = ( + k == SampleBatch.ACTIONS and not self.disable_action_flattening + ) + # Note (Artur) RL Modules's states need no flattening + should_flatten_state_key = ( + k.startswith("state_out") and not self._enable_new_api_stack + ) + if ( + k == SampleBatch.INFOS + or should_flatten_state_key + or should_flatten_action_key + ): + if should_flatten_action_key: + v = flatten_to_single_ndarray(v) + # Briefly remove dummy value to add to buffer + if k in self.data_cols_with_dummy_values: + dummy = self.buffers[k][0].pop(-1) + self.buffers[k][0].append(v) + # Add back dummy value + if k in self.data_cols_with_dummy_values: + self.buffers[k][0].append(dummy) + # Flatten all other columns. + else: + flattened = tree.flatten(v) + for i, sub_list in enumerate(self.buffers[k]): + # Briefly remove dummy value to add to buffer + if k in self.data_cols_with_dummy_values: + dummy = sub_list.pop(-1) + sub_list.append(flattened[i]) + # Add back dummy value + if k in self.data_cols_with_dummy_values: + sub_list.append(dummy) + + # In inference mode, we don't need to keep all of trajectory in memory + # we only need to keep the steps required. We can pop from the beginning to + # create room for new data. + if not self.training: + for k in self.buffers: + for sub_list in self.buffers[k]: + if sub_list: + sub_list.pop(0) + + self.agent_steps += 1 + + def build_for_inference(self) -> SampleBatch: + """During inference, we will build a SampleBatch with a batch size of 1 that + can then be used to run the forward pass of a policy. This data will only + include the enviornment context for running the policy at the last timestep. + + Returns: + A SampleBatch with a batch size of 1. + """ + + batch_data = {} + np_data = {} + for view_col, view_req in self.view_requirements.items(): + # Create the batch of data from the different buffers. + data_col = view_req.data_col or view_col + + # if this view is not for inference, skip it. 
+ if not view_req.used_for_compute_actions: + continue + + if np.any(view_req.shift_arr > 0): + raise ValueError( + f"During inference the agent can only use past observations to " + f"respect causality. However, view_col = {view_col} seems to " + f"depend on future indices {view_req.shift_arr}, while the " + f"used_for_compute_actions flag is set to True. Please fix the " + f"discrepancy. Hint: If you are using a custom model make sure " + f"the view_requirements are initialized properly and is point " + f"only refering to past timesteps during inference." + ) + + # Some columns don't exist yet + # (get created during postprocessing or depend on state_out). + if data_col not in self.buffers: + self._fill_buffer_with_initial_values( + data_col, view_req, build_for_inference=True + ) + self._prepare_for_data_cols_with_dummy_values(data_col) + + # Keep an np-array cache, so we don't have to regenerate the + # np-array for different view_cols using to the same data_col. + self._cache_in_np(np_data, data_col) + + data = [] + for d in np_data[data_col]: + # if shift_arr = [0] the data will be just the last time step + # (len(d) - 1), if shift_arr = [-1] the data will be just the timestep + # before the last one (len(d) - 2) and so on. + element_at_t = d[view_req.shift_arr + len(d) - 1] + if element_at_t.shape[0] == 1: + # We'd normally squeeze here to remove the time dim, but we'll + # simply use the time dim as the batch dim. + data.append(element_at_t) + continue + # add the batch dimension with [None] + data.append(element_at_t[None]) + + # We unflatten even if data is empty here, because the structure might be + # nested with empty leafs and so we still need to reconstruct it. + # This is useful because we spec-check states in RLModules and these + # states can sometimes be nested dicts with empty leafs. 
+ batch_data[view_col] = self._unflatten_as_buffer_struct(data, data_col) + + batch = self._get_sample_batch(batch_data) + return batch + + # TODO: @kouorsh we don't really need view_requirements anymore since it's already + # an attribute of the class + def build_for_training( + self, view_requirements: ViewRequirementsDict + ) -> SampleBatch: + """Builds a SampleBatch from the thus-far collected agent data. + + If the episode/trajectory has no TERMINATED|TRUNCATED=True at the end, will + copy the necessary n timesteps at the end of the trajectory back to the + beginning of the buffers and wait for new samples coming in. + SampleBatches created by this method will be ready for postprocessing + by a Policy. + + Args: + view_requirements: The viewrequirements dict needed to build the + SampleBatch from the raw buffers (which may have data shifts as well as + mappings from view-col to data-col in them). + + Returns: + SampleBatch: The built SampleBatch for this agent, ready to go into + postprocessing. + """ + batch_data = {} + np_data = {} + for view_col, view_req in view_requirements.items(): + # Create the batch of data from the different buffers. + data_col = view_req.data_col or view_col + + if data_col not in self.buffers: + is_state = self._fill_buffer_with_initial_values( + data_col, view_req, build_for_inference=False + ) + + # We need to skip this view_col if it does not exist in the buffers and + # is not an RNN state because it could be the special keys that gets + # added by policy's postprocessing function for training. + if not is_state: + continue + + # OBS and INFOS are already shifted by -1 (the initial obs/info starts one + # ts before all other data columns). + obs_shift = -1 if data_col in [SampleBatch.OBS, SampleBatch.INFOS] else 0 + + # Keep an np-array cache so we don't have to regenerate the + # np-array for different view_cols using to the same data_col. 
+ self._cache_in_np(np_data, data_col) + + # Go through each time-step in the buffer and construct the view + # accordingly. + data = [] + for d in np_data[data_col]: + shifted_data = [] + + # batch_repeat_value determines how many time steps should we skip + # before we repeat indexing the data. + # Example: batch_repeat_value=10, shift_arr = [-3, -2, -1], + # shift_before = 3 + # buffer = [-3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] + # resulting_data = [[-3, -2, -1], [7, 8, 9]] + # explanation: For t=0, we output [-3, -2, -1]. We then skip 10 time + # steps ahead and get to t=10. For t=10, we output [7, 8, 9]. We skip + # 10 more time steps and get to t=20. but since t=20 is out of bound we + # stop. + + # count computes the number of time steps that we need to consider. + # if batch_repeat_value = 1, this number should be the length of + # episode so far, which is len(buffer) - shift_before (-1 if this + # value was gained during inference. This is because we keep a dummy + # value at the last position of the buffer that makes it one longer). + count = int( + math.ceil( + ( + len(d) + - int(data_col in self.data_cols_with_dummy_values) + - self.shift_before + ) + / view_req.batch_repeat_value + ) + ) + for i in range(count): + # the indices for time step t + inds = ( + self.shift_before + + obs_shift + + view_req.shift_arr + + (i * view_req.batch_repeat_value) + ) + + # handle the case where the inds are out of bounds from the end. + # if during the indexing any of the indices are out of bounds, we + # need to use padding on the end to fill in the missing indices. + # Create padding first time we encounter data + if max(inds) < len(d): + # Simple case where we can simply pick slices from buffer + element_at_t = d[inds] + else: + # Case in which we have to pad because buffer has insufficient + # length. This branch takes more time than simply picking + # slices we try to avoid it. 
+ element_at_t = _get_buffered_slice_with_paddings(d, inds) + element_at_t = np.stack(element_at_t) + + if element_at_t.shape[0] == 1: + # Remove the T dimension if it is 1. + element_at_t = element_at_t[0] + shifted_data.append(element_at_t) + + # in some multi-agent cases shifted_data may be an empty list. + # In this case we should just create an empty array and return it. + if shifted_data: + shifted_data_np = np.stack(shifted_data, 0) + else: + shifted_data_np = np.array(shifted_data) + data.append(shifted_data_np) + + # We unflatten even if data is empty here, because the structure might be + # nested with empty leafs and so we still need to reconstruct it. + # This is useful because we spec-check states in RLModules and these + # states can sometimes be nested dicts with empty leafs. + batch_data[view_col] = self._unflatten_as_buffer_struct(data, data_col) + + batch = self._get_sample_batch(batch_data) + + # This trajectory is continuing -> Copy data at the end (in the size of + # self.shift_before) to the beginning of buffers and erase everything + # else. + if ( + SampleBatch.TERMINATEDS in self.buffers + and not self.buffers[SampleBatch.TERMINATEDS][0][-1] + and SampleBatch.TRUNCATEDS in self.buffers + and not self.buffers[SampleBatch.TRUNCATEDS][0][-1] + ): + # Copy data to beginning of buffer and cut lists. + if self.shift_before > 0: + for k, data in self.buffers.items(): + # Loop through + for i in range(len(data)): + self.buffers[k][i] = data[i][-self.shift_before :] + self.agent_steps = 0 + + # Reset our unroll_id. + self.unroll_id = None + + return batch + + def _build_buffers(self, single_row: Dict[str, TensorType]) -> None: + """Builds the buffers for sample collection, given an example data row. + + Args: + single_row (Dict[str, TensorType]): A single row (keys=column + names) of data to base the buffers on. 
+ """ + for col, data in single_row.items(): + if col in self.buffers: + continue + + shift = self.shift_before - ( + 1 + if col + in [ + SampleBatch.OBS, + SampleBatch.INFOS, + SampleBatch.EPS_ID, + SampleBatch.AGENT_INDEX, + SampleBatch.ENV_ID, + SampleBatch.T, + SampleBatch.UNROLL_ID, + ] + else 0 + ) + + # Store all data as flattened lists, except INFOS and state-out + # lists. These are monolithic items (infos is a dict that + # should not be further split, same for state-out items, which + # could be custom dicts as well). + should_flatten_action_key = ( + col == SampleBatch.ACTIONS and not self.disable_action_flattening + ) + # Note (Artur) RL Modules's states need no flattening + should_flatten_state_key = ( + col.startswith("state_out") and not self._enable_new_api_stack + ) + if ( + col == SampleBatch.INFOS + or should_flatten_state_key + or should_flatten_action_key + ): + if should_flatten_action_key: + data = flatten_to_single_ndarray(data) + self.buffers[col] = [[data for _ in range(shift)]] + else: + self.buffers[col] = [ + [v for _ in range(shift)] for v in tree.flatten(data) + ] + # Store an example data struct so we know, how to unflatten + # each data col. + self.buffer_structs[col] = data + + def _get_sample_batch(self, batch_data: Dict[str, TensorType]) -> SampleBatch: + """Returns a SampleBatch from the given data dictionary. Also updates the + sequence information based on the max_seq_len.""" + + # Due to possible batch-repeats > 1, columns in the resulting batch + # may not all have the same batch size. + batch = SampleBatch(batch_data, is_training=self.training) + + # Adjust the seq-lens array depending on the incoming agent sequences. 
+ if self.is_policy_recurrent: + seq_lens = [] + max_seq_len = self.max_seq_len + count = batch.count + while count > 0: + seq_lens.append(min(count, max_seq_len)) + count -= max_seq_len + batch["seq_lens"] = np.array(seq_lens) + batch.max_seq_len = max_seq_len + + return batch + + def _cache_in_np(self, cache_dict: Dict[str, List[np.ndarray]], key: str) -> None: + """Caches the numpy version of the key in the buffer dict.""" + if key not in cache_dict: + cache_dict[key] = [_to_float_np_array(d) for d in self.buffers[key]] + + def _unflatten_as_buffer_struct( + self, data: List[np.ndarray], key: str + ) -> np.ndarray: + """Unflattens the given to match the buffer struct format for that key.""" + if key not in self.buffer_structs: + return data[0] + + return tree.unflatten_as(self.buffer_structs[key], data) + + def _fill_buffer_with_initial_values( + self, + data_col: str, + view_requirement: ViewRequirement, + build_for_inference: bool = False, + ) -> bool: + """Fills the buffer with the initial values for the given data column. + for dat_col starting with `state_out`, use the initial states of the policy, + but for other data columns, create a dummy value based on the view requirement + space. + + Args: + data_col: The data column to fill the buffer with. + view_requirement: The view requirement for the view_col. Normally the view + requirement for the data column is used and if it does not exist for + some reason the view requirement for view column is used instead. + build_for_inference: Whether this is getting called for inference or not. + + returns: + is_state: True if the data_col is an RNN state, False otherwise. 
+ """ + try: + space = self.view_requirements[data_col].space + except KeyError: + space = view_requirement.space + + # special treatment for state_out + # add them to the buffer in case they don't exist yet + is_state = True + if data_col.startswith("state_out"): + if self._enable_new_api_stack: + self._build_buffers({data_col: self.initial_states}) + else: + if not self.is_policy_recurrent: + raise ValueError( + f"{data_col} is not available, because the given policy is" + f"not recurrent according to the input model_inital_states." + f"Have you forgotten to return non-empty lists in" + f"policy.get_initial_states()?" + ) + state_ind = int(data_col.split("_")[-1]) + self._build_buffers({data_col: self.initial_states[state_ind]}) + else: + is_state = False + # only create dummy data during inference + if build_for_inference: + if isinstance(space, Space): + # state_out assumes the values do not have a batch dimension + # (i.e. instead of being (1, d) it is of shape (d,). + fill_value = get_dummy_batch_for_space( + space, + batch_size=0, + ) + else: + fill_value = space + + self._build_buffers({data_col: fill_value}) + + return is_state + + def _prepare_for_data_cols_with_dummy_values(self, data_col): + self.data_cols_with_dummy_values.add(data_col) + # For items gained during inference, we append a dummy value here so + # that view requirements viewing these is not shifted by 1 + for b in self.buffers[data_col]: + b.append(b[-1]) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/sample_collector.py b/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/sample_collector.py new file mode 100644 index 0000000000000000000000000000000000000000..75dbb5d040a5a11e042a8d0cd18ca640f828eefd --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/sample_collector.py @@ -0,0 +1,298 @@ +import logging +from abc import ABCMeta, abstractmethod +from typing import TYPE_CHECKING, Dict, List, Optional, Union 
+ +from ray.rllib.policy.policy_map import PolicyMap +from ray.rllib.policy.sample_batch import MultiAgentBatch, SampleBatch +from ray.rllib.utils.annotations import OldAPIStack +from ray.rllib.utils.typing import AgentID, EnvID, EpisodeID, PolicyID, TensorType + +if TYPE_CHECKING: + from ray.rllib.callbacks.callbacks import RLlibCallback + +logger = logging.getLogger(__name__) + + +# fmt: off +# __sphinx_doc_begin__ +@OldAPIStack +class SampleCollector(metaclass=ABCMeta): + """Collects samples for all policies and agents from a multi-agent env. + + This API is controlled by RolloutWorker objects to store all data + generated by Environments and Policies/Models during rollout and + postprocessing. It's purposes are to a) make data collection and + SampleBatch/input_dict generation from this data faster, b) to unify + the way we collect samples from environments and model (outputs), thereby + allowing for possible user customizations, c) to allow for more complex + inputs fed into different policies (e.g. multi-agent case with inter-agent + communication channel). + """ + + def __init__(self, + policy_map: PolicyMap, + clip_rewards: Union[bool, float], + callbacks: "RLlibCallback", + multiple_episodes_in_batch: bool = True, + rollout_fragment_length: int = 200, + count_steps_by: str = "env_steps"): + """Initializes a SampleCollector instance. + + Args: + policy_map: Maps policy ids to policy instances. + clip_rewards (Union[bool, float]): Whether to clip rewards before + postprocessing (at +/-1.0) or the actual value to +/- clip. + callbacks: RLlib callbacks. + multiple_episodes_in_batch: Whether it's allowed to pack + multiple episodes into the same built batch. 
+ rollout_fragment_length: The + + """ + + self.policy_map = policy_map + self.clip_rewards = clip_rewards + self.callbacks = callbacks + self.multiple_episodes_in_batch = multiple_episodes_in_batch + self.rollout_fragment_length = rollout_fragment_length + self.count_steps_by = count_steps_by + + @abstractmethod + def add_init_obs( + self, + *, + episode, + agent_id: AgentID, + policy_id: PolicyID, + init_obs: TensorType, + init_infos: Optional[Dict[str, TensorType]] = None, + t: int = -1, + ) -> None: + """Adds an initial obs (after reset) to this collector. + + Since the very first observation in an environment is collected w/o + additional data (w/o actions, w/o reward) after env.reset() is called, + this method initializes a new trajectory for a given agent. + `add_init_obs()` has to be called first for each agent/episode-ID + combination. After this, only `add_action_reward_next_obs()` must be + called for that same agent/episode-pair. + + Args: + episode: The Episode, for which we + are adding an Agent's initial observation. + agent_id: Unique id for the agent we are adding + values for. + env_id: The environment index (in a vectorized setup). + policy_id: Unique id for policy controlling the agent. + init_obs: Initial observation (after env.reset()). + init_obs: Initial observation (after env.reset()). + init_infos: Initial infos dict (after env.reset()). + t: The time step (episode length - 1). The initial obs has + ts=-1(!), then an action/reward/next-obs at t=0, etc.. + + .. 
testcode:: + :skipif: True + + obs, infos = env.reset() + collector.add_init_obs( + episode=my_episode, + agent_id=0, + policy_id="pol0", + t=-1, + init_obs=obs, + init_infos=infos, + ) + obs, r, terminated, truncated, info = env.step(action) + collector.add_action_reward_next_obs(12345, 0, "pol0", False, { + "action": action, "obs": obs, "reward": r, "terminated": terminated, + "truncated": truncated, "info": info + }) + """ + raise NotImplementedError + + @abstractmethod + def add_action_reward_next_obs( + self, + episode_id: EpisodeID, + agent_id: AgentID, + env_id: EnvID, + policy_id: PolicyID, + agent_done: bool, + values: Dict[str, TensorType], + ) -> None: + """Add the given dictionary (row) of values to this collector. + + The incoming data (`values`) must include action, reward, terminated, truncated, + and next_obs information and may include any other information. + For the initial observation (after Env.reset()) of the given agent/ + episode-ID combination, `add_initial_obs()` must be called instead. + + Args: + episode_id: Unique id for the episode we are adding + values for. + agent_id: Unique id for the agent we are adding + values for. + env_id: The environment index (in a vectorized setup). + policy_id: Unique id for policy controlling the agent. + agent_done: Whether the given agent is done (terminated or truncated) with + its trajectory (the multi-agent episode may still be ongoing). + values (Dict[str, TensorType]): Row of values to add for this + agent. This row must contain the keys SampleBatch.ACTION, + REWARD, NEW_OBS, TERMINATED, and TRUNCATED. + + .. 
testcode:: + :skipif: True + + obs, info = env.reset() + collector.add_init_obs(12345, 0, "pol0", obs) + obs, r, terminated, truncated, info = env.step(action) + collector.add_action_reward_next_obs( + 12345, + 0, + "pol0", + agent_done=False, + values={ + "action": action, "obs": obs, "reward": r, + "terminated": terminated, "truncated": truncated + }, + ) + """ + raise NotImplementedError + + @abstractmethod + def episode_step(self, episode) -> None: + """Increases the episode step counter (across all agents) by one. + + Args: + episode: Episode we are stepping through. + Useful for handling counting b/c it is called once across + all agents that are inside this episode. + """ + raise NotImplementedError + + @abstractmethod + def total_env_steps(self) -> int: + """Returns total number of env-steps taken so far. + + Thereby, a step in an N-agent multi-agent environment counts as only 1 + for this metric. The returned count contains everything that has not + been built yet (and returned as MultiAgentBatches by the + `try_build_truncated_episode_multi_agent_batch` or + `postprocess_episode(build=True)` methods). After such build, this + counter is reset to 0. + + Returns: + int: The number of env-steps taken in total in the environment(s) + so far. + """ + raise NotImplementedError + + @abstractmethod + def total_agent_steps(self) -> int: + """Returns total number of (individual) agent-steps taken so far. + + Thereby, a step in an N-agent multi-agent environment counts as N. + If less than N agents have stepped (because some agents were not + required to send actions), the count will be increased by less than N. + The returned count contains everything that has not been built yet + (and returned as MultiAgentBatches by the + `try_build_truncated_episode_multi_agent_batch` or + `postprocess_episode(build=True)` methods). After such build, this + counter is reset to 0. 
+ + Returns: + int: The number of (individual) agent-steps taken in total in the + environment(s) so far. + """ + raise NotImplementedError + + # TODO(jungong) : Remove this API call once we completely move to + # connector based sample collection. + @abstractmethod + def get_inference_input_dict(self, policy_id: PolicyID) -> \ + Dict[str, TensorType]: + """Returns an input_dict for an (inference) forward pass from our data. + + The input_dict can then be used for action computations inside a + Policy via `Policy.compute_actions_from_input_dict()`. + + Args: + policy_id: The Policy ID to get the input dict for. + + Returns: + Dict[str, TensorType]: The input_dict to be passed into the ModelV2 + for inference/training. + + .. testcode:: + :skipif: True + + obs, r, terminated, truncated, info = env.step(action) + collector.add_action_reward_next_obs(12345, 0, "pol0", False, { + "action": action, "obs": obs, "reward": r, + "terminated": terminated, "truncated", truncated + }) + input_dict = collector.get_inference_input_dict(policy.model) + action = policy.compute_actions_from_input_dict(input_dict) + # repeat + """ + raise NotImplementedError + + @abstractmethod + def postprocess_episode( + self, + episode, + is_done: bool = False, + check_dones: bool = False, + build: bool = False, + ) -> Optional[MultiAgentBatch]: + """Postprocesses all agents' trajectories in a given episode. + + Generates (single-trajectory) SampleBatches for all Policies/Agents and + calls Policy.postprocess_trajectory on each of these. Postprocessing + may happens in-place, meaning any changes to the viewed data columns + are directly reflected inside this collector's buffers. + Also makes sure that additional (newly created) data columns are + correctly added to the buffers. + + Args: + episode: The Episode object for which + to post-process data. + is_done: Whether the given episode is actually terminated + (all agents are terminated OR truncated). 
If True, the + episode will no longer be used/continued and we may need to + recycle/erase it internally. If a soft-horizon is hit, the + episode will continue to be used and `is_done` should be set + to False here. + check_dones: Whether we need to check that all agents' + trajectories have dones=True at the end. + build: Whether to build a MultiAgentBatch from the given + episode (and only that episode!) and return that + MultiAgentBatch. Used for batch_mode=`complete_episodes`. + + Returns: + Optional[MultiAgentBatch]: If `build` is True, the + SampleBatch or MultiAgentBatch built from `episode` (either + just from that episde or from the `_PolicyCollectorGroup` + in the `episode.batch_builder` property). + """ + raise NotImplementedError + + @abstractmethod + def try_build_truncated_episode_multi_agent_batch(self) -> \ + List[Union[MultiAgentBatch, SampleBatch]]: + """Tries to build an MA-batch, if `rollout_fragment_length` is reached. + + Any unprocessed data will be first postprocessed with a policy + postprocessor. + This is usually called to collect samples for policy training. + If not enough data has been collected yet (`rollout_fragment_length`), + returns an empty list. + + Returns: + List[Union[MultiAgentBatch, SampleBatch]]: Returns a (possibly + empty) list of MultiAgentBatches (containing the accumulated + SampleBatches for each policy or a simple SampleBatch if only + one policy). The list will be empty if + `self.rollout_fragment_length` has not been reached yet. 
+ """ + raise NotImplementedError +# __sphinx_doc_end__ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/__pycache__/complex_input_net.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/__pycache__/complex_input_net.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1a6f376f06cdaa3ca8b2ec20eb536f147a96c860 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/__pycache__/complex_input_net.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/__pycache__/torch_modelv2.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/__pycache__/torch_modelv2.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2a3832b1a02860ce538b733b58a8243803b60c6f Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/__pycache__/torch_modelv2.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2585dcc77abe4acd4cd6daf49f902a4117a1438c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__init__.py @@ -0,0 +1,13 @@ +from ray.rllib.models.torch.modules.gru_gate import GRUGate +from ray.rllib.models.torch.modules.multi_head_attention import MultiHeadAttention +from ray.rllib.models.torch.modules.relative_multi_head_attention import ( + RelativeMultiHeadAttention, +) +from ray.rllib.models.torch.modules.skip_connection import SkipConnection + +__all__ = [ + "GRUGate", + "RelativeMultiHeadAttention", + "SkipConnection", + "MultiHeadAttention", +] diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/__init__.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9f3d4b594e9b1564ef533d30378699b76a21af9c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/gru_gate.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/gru_gate.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..917dd476eb69d0c42b9edf3afb833d9a3f308e8e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/gru_gate.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/multi_head_attention.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/multi_head_attention.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1e6602e0c0fcf30bc13ce25abafd1a3e8d2c2916 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/multi_head_attention.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/noisy_layer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/noisy_layer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8e985f98126a4489651e70e023beb2962a4e52db Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/noisy_layer.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/relative_multi_head_attention.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/relative_multi_head_attention.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9bed8beed66cddd8a4948e908f6549159abc9121 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/relative_multi_head_attention.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/skip_connection.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/skip_connection.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ef20490676b30b24725529e07930a564df135ad0 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/skip_connection.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/skip_connection.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/skip_connection.py new file mode 100644 index 0000000000000000000000000000000000000000..444c1680686153b04d44abe61fb581b94fbc49b3 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/skip_connection.py @@ -0,0 +1,43 @@ +from ray.rllib.utils.annotations import OldAPIStack +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.typing import TensorType +from typing import Optional + +torch, nn = try_import_torch() + + +@OldAPIStack +class SkipConnection(nn.Module): + """Skip connection layer. + + Adds the original input to the output (regular residual layer) OR uses + input as hidden state input to a given fan_in_layer. + """ + + def __init__( + self, layer: nn.Module, fan_in_layer: Optional[nn.Module] = None, **kwargs + ): + """Initializes a SkipConnection nn Module object. + + Args: + layer (nn.Module): Any layer processing inputs. 
+ fan_in_layer (Optional[nn.Module]): An optional + layer taking two inputs: The original input and the output + of `layer`. + """ + super().__init__(**kwargs) + self._layer = layer + self._fan_in_layer = fan_in_layer + + def forward(self, inputs: TensorType, **kwargs) -> TensorType: + # del kwargs + outputs = self._layer(inputs, **kwargs) + # Residual case, just add inputs to outputs. + if self._fan_in_layer is None: + outputs = outputs + inputs + # Fan-in e.g. RNN: Call fan-in with `inputs` and `outputs`. + else: + # NOTE: In the GRU case, `inputs` is the state input. + outputs = self._fan_in_layer((inputs, outputs)) + + return outputs diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1ec50561a0adbcf99b7e44bb331a3f14dc2017c1 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/d4rl_reader.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/d4rl_reader.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..45b14dce734bf8f63596dc175b4a48e748938a5a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/d4rl_reader.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/input_reader.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/input_reader.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c7769c87788d41c1075fc8063ef71dab91397f3a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/input_reader.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/resource.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/resource.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df7df56ef78ef608fe43ed4c4912a019bfe56a2a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/resource.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/shuffled_input.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/shuffled_input.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bd269900e67c8d76b975ba8834a9fe7d7d33fd37 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/shuffled_input.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..74131faf3eb6d98c81ea97aa064bc991c01afe96 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__init__.py @@ -0,0 +1,15 @@ +from ray.rllib.offline.estimators.importance_sampling import ImportanceSampling +from ray.rllib.offline.estimators.weighted_importance_sampling import ( + WeightedImportanceSampling, +) +from ray.rllib.offline.estimators.direct_method import DirectMethod +from ray.rllib.offline.estimators.doubly_robust import DoublyRobust +from ray.rllib.offline.estimators.off_policy_estimator import OffPolicyEstimator + +__all__ = [ + "OffPolicyEstimator", + "ImportanceSampling", + "WeightedImportanceSampling", + "DirectMethod", + "DoublyRobust", +] diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__pycache__/doubly_robust.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__pycache__/doubly_robust.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8b9fad3da5c1d272f74dbb85195a700c8e15fe4e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__pycache__/doubly_robust.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__pycache__/feature_importance.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__pycache__/feature_importance.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9cebb4b9951b3bcb1c53df636016cb1a057e889c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__pycache__/feature_importance.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__pycache__/importance_sampling.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__pycache__/importance_sampling.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..140e177e18fea63260f0d135c8f74d2205100c43 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__pycache__/importance_sampling.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__pycache__/weighted_importance_sampling.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__pycache__/weighted_importance_sampling.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..905ea71009e637691666e0a28c7d08048492af62 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__pycache__/weighted_importance_sampling.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/direct_method.py 
b/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/direct_method.py new file mode 100644 index 0000000000000000000000000000000000000000..c735b93a5e1b23b5217e8d2f1eec39c58ef4c2c2 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/direct_method.py @@ -0,0 +1,180 @@ +import logging +from typing import Dict, Any, Optional, List +import math +import numpy as np + +from ray.data import Dataset + +from ray.rllib.offline.estimators.off_policy_estimator import OffPolicyEstimator +from ray.rllib.offline.offline_evaluation_utils import compute_q_and_v_values +from ray.rllib.offline.offline_evaluator import OfflineEvaluator +from ray.rllib.offline.estimators.fqe_torch_model import FQETorchModel +from ray.rllib.policy import Policy +from ray.rllib.policy.sample_batch import convert_ma_batch_to_sample_batch +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils.annotations import DeveloperAPI, override +from ray.rllib.utils.typing import SampleBatchType +from ray.rllib.utils.numpy import convert_to_numpy + +logger = logging.getLogger() + + +@DeveloperAPI +class DirectMethod(OffPolicyEstimator): + r"""The Direct Method estimator. + + Let s_t, a_t, and r_t be the state, action, and reward at timestep t. + + This method trains a Q-model for the evaluation policy \pi_e on behavior + data generated by \pi_b. Currently, RLlib implements this using + Fitted-Q Evaluation (FQE). You can also implement your own model + and pass it in as `q_model_config = {"type": your_model_class, **your_kwargs}`. + + This estimator computes the expected return for \pi_e for an episode as: + V^{\pi_e}(s_0) = \sum_{a \in A} \pi_e(a | s_0) Q(s_0, a) + and returns the mean and standard deviation over episodes. 
+ + For more information refer to https://arxiv.org/pdf/1911.06854.pdf""" + + @override(OffPolicyEstimator) + def __init__( + self, + policy: Policy, + gamma: float, + epsilon_greedy: float = 0.0, + q_model_config: Optional[Dict] = None, + ): + """Initializes a Direct Method OPE Estimator. + + Args: + policy: Policy to evaluate. + gamma: Discount factor of the environment. + epsilon_greedy: The probability by which we act acording to a fully random + policy during deployment. With 1-epsilon_greedy we act according the + target policy. + q_model_config: Arguments to specify the Q-model. Must specify + a `type` key pointing to the Q-model class. + This Q-model is trained in the train() method and is used + to compute the state-value estimates for the DirectMethod estimator. + It must implement `train` and `estimate_v`. + TODO (Rohan138): Unify this with RLModule API. + """ + + super().__init__(policy, gamma, epsilon_greedy) + + # Some dummy policies and ones that are not based on a tensor framework + # backend can come without a config or without a framework key. + if hasattr(policy, "config"): + assert ( + policy.config.get("framework", "torch") == "torch" + ), "Framework must be torch to use DirectMethod." + + q_model_config = q_model_config or {} + model_cls = q_model_config.pop("type", FQETorchModel) + self.model = model_cls( + policy=policy, + gamma=gamma, + **q_model_config, + ) + assert hasattr( + self.model, "estimate_v" + ), "self.model must implement `estimate_v`!" 
+ + @override(OffPolicyEstimator) + def estimate_on_single_episode(self, episode: SampleBatch) -> Dict[str, Any]: + estimates_per_epsiode = {} + rewards = episode["rewards"] + + v_behavior = 0.0 + for t in range(episode.count): + v_behavior += rewards[t] * self.gamma**t + + v_target = self._compute_v_target(episode[:1]) + + estimates_per_epsiode["v_behavior"] = v_behavior + estimates_per_epsiode["v_target"] = v_target + + return estimates_per_epsiode + + @override(OffPolicyEstimator) + def estimate_on_single_step_samples( + self, batch: SampleBatch + ) -> Dict[str, List[float]]: + estimates_per_epsiode = {} + rewards = batch["rewards"] + + v_behavior = rewards + v_target = self._compute_v_target(batch) + + estimates_per_epsiode["v_behavior"] = v_behavior + estimates_per_epsiode["v_target"] = v_target + + return estimates_per_epsiode + + def _compute_v_target(self, init_step): + v_target = self.model.estimate_v(init_step) + v_target = convert_to_numpy(v_target) + return v_target + + @override(OffPolicyEstimator) + def train(self, batch: SampleBatchType) -> Dict[str, Any]: + """Trains self.model on the given batch. + + Args: + batch: A SampleBatchType to train on + + Returns: + A dict with key "loss" and value as the mean training loss. + """ + batch = convert_ma_batch_to_sample_batch(batch) + losses = self.model.train(batch) + return {"loss": np.mean(losses)} + + @override(OfflineEvaluator) + def estimate_on_dataset( + self, dataset: Dataset, *, n_parallelism: int = ... + ) -> Dict[str, Any]: + """Calculates the Direct Method estimate on the given dataset. + + Note: This estimate works for only discrete action spaces for now. + + Args: + dataset: Dataset to compute the estimate on. Each record in dataset should + include the following columns: `obs`, `actions`, `action_prob` and + `rewards`. The `obs` on each row shoud be a vector of D dimensions. + n_parallelism: The number of parallel workers to use. 
+ + Returns: + Dictionary with the following keys: + v_target: The estimated value of the target policy. + v_behavior: The estimated value of the behavior policy. + v_gain: The estimated gain of the target policy over the behavior + policy. + v_std: The standard deviation of the estimated value of the target. + """ + # compute v_values + batch_size = max(dataset.count() // n_parallelism, 1) + updated_ds = dataset.map_batches( + compute_q_and_v_values, + batch_size=batch_size, + batch_format="pandas", + fn_kwargs={ + "model_class": self.model.__class__, + "model_state": self.model.get_state(), + "compute_q_values": False, + }, + ) + + v_behavior = updated_ds.mean("rewards") + v_target = updated_ds.mean("v_values") + v_gain_mean = v_target / v_behavior + v_gain_ste = ( + updated_ds.std("v_values") / v_behavior / math.sqrt(dataset.count()) + ) + + return { + "v_behavior": v_behavior, + "v_target": v_target, + "v_gain_mean": v_gain_mean, + "v_gain_ste": v_gain_ste, + } diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/doubly_robust.py b/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/doubly_robust.py new file mode 100644 index 0000000000000000000000000000000000000000..d98028023660612e329dc1b555a2ba8151078bc2 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/doubly_robust.py @@ -0,0 +1,253 @@ +import logging +import numpy as np +import math +import pandas as pd + +from typing import Dict, Any, Optional, List + +from ray.data import Dataset + +from ray.rllib.policy import Policy +from ray.rllib.policy.sample_batch import SampleBatch, convert_ma_batch_to_sample_batch +from ray.rllib.utils.annotations import DeveloperAPI, override +from ray.rllib.utils.typing import SampleBatchType +from ray.rllib.utils.numpy import convert_to_numpy + +from ray.rllib.offline.estimators.off_policy_estimator import OffPolicyEstimator +from ray.rllib.offline.estimators.fqe_torch_model import FQETorchModel +from 
ray.rllib.offline.offline_evaluator import OfflineEvaluator +from ray.rllib.offline.offline_evaluation_utils import ( + compute_is_weights, + compute_q_and_v_values, +) + +logger = logging.getLogger() + + +@DeveloperAPI +class DoublyRobust(OffPolicyEstimator): + """The Doubly Robust estimator. + + Let s_t, a_t, and r_t be the state, action, and reward at timestep t. + + This method trains a Q-model for the evaluation policy \pi_e on behavior + data generated by \pi_b. Currently, RLlib implements this using + Fitted-Q Evaluation (FQE). You can also implement your own model + and pass it in as `q_model_config = {"type": your_model_class, **your_kwargs}`. + + For behavior policy \pi_b and evaluation policy \pi_e, define the + cumulative importance ratio at timestep t as: + p_t = \sum_{t'=0}^t (\pi_e(a_{t'} | s_{t'}) / \pi_b(a_{t'} | s_{t'})). + + Consider an episode with length T. Let V_T = 0. + For all t in {0, T - 1}, use the following recursive update: + V_t^DR = (\sum_{a \in A} \pi_e(a | s_t) Q(s_t, a)) + + p_t * (r_t + \gamma * V_{t+1}^DR - Q(s_t, a_t)) + + This estimator computes the expected return for \pi_e for an episode as: + V^{\pi_e}(s_0) = V_0^DR + and returns the mean and standard deviation over episodes. + + For more information refer to https://arxiv.org/pdf/1911.06854.pdf""" + + @override(OffPolicyEstimator) + def __init__( + self, + policy: Policy, + gamma: float, + epsilon_greedy: float = 0.0, + normalize_weights: bool = True, + q_model_config: Optional[Dict] = None, + ): + """Initializes a Doubly Robust OPE Estimator. + + Args: + policy: Policy to evaluate. + gamma: Discount factor of the environment. + epsilon_greedy: The probability by which we act acording to a fully random + policy during deployment. With 1-epsilon_greedy we act + according the target policy. + normalize_weights: If True, the inverse propensity scores are normalized to + their sum across the entire dataset. 
The effect of this is similar to + weighted importance sampling compared to standard importance sampling. + q_model_config: Arguments to specify the Q-model. Must specify + a `type` key pointing to the Q-model class. + This Q-model is trained in the train() method and is used + to compute the state-value and Q-value estimates + for the DoublyRobust estimator. + It must implement `train`, `estimate_q`, and `estimate_v`. + TODO (Rohan138): Unify this with RLModule API. + """ + + super().__init__(policy, gamma, epsilon_greedy) + q_model_config = q_model_config or {} + q_model_config["gamma"] = gamma + + self._model_cls = q_model_config.pop("type", FQETorchModel) + self._model_configs = q_model_config + self._normalize_weights = normalize_weights + + self.model = self._model_cls( + policy=policy, + **q_model_config, + ) + assert hasattr( + self.model, "estimate_v" + ), "self.model must implement `estimate_v`!" + assert hasattr( + self.model, "estimate_q" + ), "self.model must implement `estimate_q`!" 
+ + @override(OffPolicyEstimator) + def estimate_on_single_episode(self, episode: SampleBatch) -> Dict[str, Any]: + estimates_per_epsiode = {} + + rewards, old_prob = episode["rewards"], episode["action_prob"] + new_prob = self.compute_action_probs(episode) + + weight = new_prob / old_prob + + v_behavior = 0.0 + v_target = 0.0 + q_values = self.model.estimate_q(episode) + q_values = convert_to_numpy(q_values) + v_values = self.model.estimate_v(episode) + v_values = convert_to_numpy(v_values) + assert q_values.shape == v_values.shape == (episode.count,) + + for t in reversed(range(episode.count)): + v_behavior = rewards[t] + self.gamma * v_behavior + v_target = v_values[t] + weight[t] * ( + rewards[t] + self.gamma * v_target - q_values[t] + ) + v_target = v_target.item() + + estimates_per_epsiode["v_behavior"] = v_behavior + estimates_per_epsiode["v_target"] = v_target + + return estimates_per_epsiode + + @override(OffPolicyEstimator) + def estimate_on_single_step_samples( + self, batch: SampleBatch + ) -> Dict[str, List[float]]: + estimates_per_epsiode = {} + + rewards, old_prob = batch["rewards"], batch["action_prob"] + new_prob = self.compute_action_probs(batch) + + q_values = self.model.estimate_q(batch) + q_values = convert_to_numpy(q_values) + v_values = self.model.estimate_v(batch) + v_values = convert_to_numpy(v_values) + + v_behavior = rewards + + weight = new_prob / old_prob + v_target = v_values + weight * (rewards - q_values) + + estimates_per_epsiode["v_behavior"] = v_behavior + estimates_per_epsiode["v_target"] = v_target + + return estimates_per_epsiode + + @override(OffPolicyEstimator) + def train(self, batch: SampleBatchType) -> Dict[str, Any]: + """Trains self.model on the given batch. + + Args: + batch: A SampleBatch or MultiAgentbatch to train on + + Returns: + A dict with key "loss" and value as the mean training loss. 
+ """ + batch = convert_ma_batch_to_sample_batch(batch) + losses = self.model.train(batch) + return {"loss": np.mean(losses)} + + @override(OfflineEvaluator) + def estimate_on_dataset( + self, dataset: Dataset, *, n_parallelism: int = ... + ) -> Dict[str, Any]: + """Estimates the policy value using the Doubly Robust estimator. + + The doubly robust estimator uses normalization of importance sampling weights + (aka. propensity ratios) to the average of the importance weights across the + entire dataset. This is done to reduce the variance of the estimate (similar to + weighted importance sampling). You can disable this by setting + `normalize_weights=False` in the constructor. + + Note: This estimate works for only discrete action spaces for now. + + Args: + dataset: Dataset to compute the estimate on. Each record in dataset should + include the following columns: `obs`, `actions`, `action_prob` and + `rewards`. The `obs` on each row shoud be a vector of D dimensions. + n_parallelism: Number of parallelism to use for the computation. + + Returns: + A dict with the following keys: + v_target: The estimated value of the target policy. + v_behavior: The estimated value of the behavior policy. + v_gain: The estimated gain of the target policy over the behavior + policy. + v_std: The standard deviation of the estimated value of the target. 
+ """ + + # step 1: compute the weights and weighted rewards + batch_size = max(dataset.count() // n_parallelism, 1) + updated_ds = dataset.map_batches( + compute_is_weights, + batch_size=batch_size, + batch_format="pandas", + fn_kwargs={ + "policy_state": self.policy.get_state(), + "estimator_class": self.__class__, + }, + ) + + # step 2: compute q_values and v_values + batch_size = max(updated_ds.count() // n_parallelism, 1) + updated_ds = updated_ds.map_batches( + compute_q_and_v_values, + batch_size=batch_size, + batch_format="pandas", + fn_kwargs={ + "model_class": self.model.__class__, + "model_state": self.model.get_state(), + }, + ) + + # step 3: compute the v_target + def compute_v_target(batch: pd.DataFrame, normalizer: float = 1.0): + weights = batch["weights"] / normalizer + batch["v_target"] = batch["v_values"] + weights * ( + batch["rewards"] - batch["q_values"] + ) + batch["v_behavior"] = batch["rewards"] + return batch + + normalizer = updated_ds.mean("weights") if self._normalize_weights else 1.0 + updated_ds = updated_ds.map_batches( + compute_v_target, + batch_size=batch_size, + batch_format="pandas", + fn_kwargs={"normalizer": normalizer}, + ) + + v_behavior = updated_ds.mean("v_behavior") + v_target = updated_ds.mean("v_target") + v_gain_mean = v_target / v_behavior + v_gain_ste = ( + updated_ds.std("v_target") + / normalizer + / v_behavior + / math.sqrt(dataset.count()) + ) + + return { + "v_behavior": v_behavior, + "v_target": v_target, + "v_gain_mean": v_gain_mean, + "v_gain_ste": v_gain_ste, + } diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/fqe_torch_model.py b/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/fqe_torch_model.py new file mode 100644 index 0000000000000000000000000000000000000000..f071640a9afd380ad30f549113085f579cd86fa9 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/fqe_torch_model.py @@ -0,0 +1,297 @@ +from typing import Dict, Any +from 
ray.rllib.models.utils import get_initializer +from ray.rllib.policy import Policy + +from ray.rllib.models.catalog import ModelCatalog +from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils.annotations import DeveloperAPI +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.annotations import is_overridden +from ray.rllib.utils.typing import ModelConfigDict, TensorType +from gymnasium.spaces import Discrete + +torch, nn = try_import_torch() + +# TODO: Create a config object for FQE and unify it with the RLModule API + + +@DeveloperAPI +class FQETorchModel: + """Pytorch implementation of the Fitted Q-Evaluation (FQE) model from + https://arxiv.org/abs/1911.06854 + """ + + def __init__( + self, + policy: Policy, + gamma: float, + model_config: ModelConfigDict = None, + n_iters: int = 1, + lr: float = 1e-3, + min_loss_threshold: float = 1e-4, + clip_grad_norm: float = 100.0, + minibatch_size: int = None, + polyak_coef: float = 1.0, + ) -> None: + """ + Args: + policy: Policy to evaluate. + gamma: Discount factor of the environment. + model_config: The ModelConfigDict for self.q_model, defaults to: + { + "fcnet_hiddens": [8, 8], + "fcnet_activation": "relu", + "vf_share_layers": True, + }, + n_iters: Number of gradient steps to run on batch, defaults to 1 + lr: Learning rate for Adam optimizer + min_loss_threshold: Early stopping if mean loss < min_loss_threshold + clip_grad_norm: Clip loss gradients to this maximum value + minibatch_size: Minibatch size for training Q-function; + if None, train on the whole batch + polyak_coef: Polyak averaging factor for target Q-function + """ + self.policy = policy + assert isinstance( + policy.action_space, Discrete + ), f"{self.__class__.__name__} only supports discrete action spaces!" 
+ self.gamma = gamma + self.observation_space = policy.observation_space + self.action_space = policy.action_space + + if model_config is None: + model_config = { + "fcnet_hiddens": [32, 32, 32], + "fcnet_activation": "relu", + "vf_share_layers": True, + } + self.model_config = model_config + + self.device = self.policy.device + self.q_model: TorchModelV2 = ModelCatalog.get_model_v2( + self.observation_space, + self.action_space, + self.action_space.n, + model_config, + framework="torch", + name="TorchQModel", + ).to(self.device) + + self.target_q_model: TorchModelV2 = ModelCatalog.get_model_v2( + self.observation_space, + self.action_space, + self.action_space.n, + model_config, + framework="torch", + name="TargetTorchQModel", + ).to(self.device) + + self.n_iters = n_iters + self.lr = lr + self.min_loss_threshold = min_loss_threshold + self.clip_grad_norm = clip_grad_norm + self.minibatch_size = minibatch_size + self.polyak_coef = polyak_coef + self.optimizer = torch.optim.Adam(self.q_model.variables(), self.lr) + initializer = get_initializer("xavier_uniform", framework="torch") + # Hard update target + self.update_target(polyak_coef=1.0) + + def f(m): + if isinstance(m, nn.Linear): + initializer(m.weight) + + self.initializer = f + + def train(self, batch: SampleBatch) -> TensorType: + """Trains self.q_model using FQE loss on given batch. 
+ + Args: + batch: A SampleBatch of episodes to train on + + Returns: + A list of losses for each training iteration + """ + losses = [] + minibatch_size = self.minibatch_size or batch.count + # Copy batch for shuffling + batch = batch.copy(shallow=True) + for _ in range(self.n_iters): + minibatch_losses = [] + batch.shuffle() + for idx in range(0, batch.count, minibatch_size): + minibatch = batch[idx : idx + minibatch_size] + obs = torch.tensor(minibatch[SampleBatch.OBS], device=self.device) + actions = torch.tensor( + minibatch[SampleBatch.ACTIONS], + device=self.device, + dtype=int, + ) + rewards = torch.tensor( + minibatch[SampleBatch.REWARDS], device=self.device + ) + next_obs = torch.tensor( + minibatch[SampleBatch.NEXT_OBS], device=self.device + ) + dones = torch.tensor( + minibatch[SampleBatch.TERMINATEDS], device=self.device, dtype=float + ) + + # Compute Q-values for current obs + q_values, _ = self.q_model({"obs": obs}, [], None) + q_acts = torch.gather(q_values, -1, actions.unsqueeze(-1)).squeeze(-1) + + next_action_probs = self._compute_action_probs(next_obs) + + # Compute Q-values for next obs + with torch.no_grad(): + next_q_values, _ = self.target_q_model({"obs": next_obs}, [], None) + + # Compute estimated state value next_v = E_{a ~ pi(s)} [Q(next_obs,a)] + next_v = torch.sum(next_q_values * next_action_probs, axis=-1) + targets = rewards + (1 - dones) * self.gamma * next_v + loss = (targets - q_acts) ** 2 + loss = torch.mean(loss) + self.optimizer.zero_grad() + loss.backward() + nn.utils.clip_grad.clip_grad_norm_( + self.q_model.variables(), self.clip_grad_norm + ) + self.optimizer.step() + minibatch_losses.append(loss.item()) + iter_loss = sum(minibatch_losses) / len(minibatch_losses) + losses.append(iter_loss) + if iter_loss < self.min_loss_threshold: + break + self.update_target() + return losses + + def estimate_q(self, batch: SampleBatch) -> TensorType: + obs = torch.tensor(batch[SampleBatch.OBS], device=self.device) + with torch.no_grad(): 
+ q_values, _ = self.q_model({"obs": obs}, [], None) + actions = torch.tensor( + batch[SampleBatch.ACTIONS], device=self.device, dtype=int + ) + q_values = torch.gather(q_values, -1, actions.unsqueeze(-1)).squeeze(-1) + return q_values + + def estimate_v(self, batch: SampleBatch) -> TensorType: + obs = torch.tensor(batch[SampleBatch.OBS], device=self.device) + with torch.no_grad(): + q_values, _ = self.q_model({"obs": obs}, [], None) + # Compute pi(a | s) for each action a in policy.action_space + action_probs = self._compute_action_probs(obs) + v_values = torch.sum(q_values * action_probs, axis=-1) + return v_values + + def update_target(self, polyak_coef=None): + # Update_target will be called periodically to copy Q network to + # target Q network, using (soft) polyak_coef-synching. + polyak_coef = polyak_coef or self.polyak_coef + model_state_dict = self.q_model.state_dict() + # Support partial (soft) synching. + # If polyak_coef == 1.0: Full sync from Q-model to target Q-model. + target_state_dict = self.target_q_model.state_dict() + model_state_dict = { + k: polyak_coef * model_state_dict[k] + (1 - polyak_coef) * v + for k, v in target_state_dict.items() + } + + self.target_q_model.load_state_dict(model_state_dict) + + def _compute_action_probs(self, obs: TensorType) -> TensorType: + """Compute action distribution over the action space. 
+ + Args: + obs: A tensor of observations of shape (batch_size * obs_dim) + + Returns: + action_probs: A tensor of action probabilities + of shape (batch_size * action_dim) + """ + input_dict = {SampleBatch.OBS: obs} + seq_lens = torch.ones(len(obs), device=self.device, dtype=int) + state_batches = [] + if is_overridden(self.policy.action_distribution_fn): + try: + # TorchPolicyV2 function signature + dist_inputs, dist_class, _ = self.policy.action_distribution_fn( + self.policy.model, + obs_batch=input_dict, + state_batches=state_batches, + seq_lens=seq_lens, + explore=False, + is_training=False, + ) + except TypeError: + # TorchPolicyV1 function signature for compatibility with DQN + # TODO: Remove this once DQNTorchPolicy is migrated to PolicyV2 + dist_inputs, dist_class, _ = self.policy.action_distribution_fn( + self.policy, + self.policy.model, + input_dict=input_dict, + state_batches=state_batches, + seq_lens=seq_lens, + explore=False, + is_training=False, + ) + else: + dist_class = self.policy.dist_class + dist_inputs, _ = self.policy.model(input_dict, state_batches, seq_lens) + action_dist = dist_class(dist_inputs, self.policy.model) + assert isinstance( + action_dist.dist, torch.distributions.categorical.Categorical + ), "FQE only supports Categorical or MultiCategorical distributions!" 
+ action_probs = action_dist.dist.probs + return action_probs + + def get_state(self) -> Dict[str, Any]: + """Returns the current state of the FQE Model.""" + return { + "policy_state": self.policy.get_state(), + "model_config": self.model_config, + "n_iters": self.n_iters, + "lr": self.lr, + "min_loss_threshold": self.min_loss_threshold, + "clip_grad_norm": self.clip_grad_norm, + "minibatch_size": self.minibatch_size, + "polyak_coef": self.polyak_coef, + "gamma": self.gamma, + "q_model_state": self.q_model.state_dict(), + "target_q_model_state": self.target_q_model.state_dict(), + } + + def set_state(self, state: Dict[str, Any]) -> None: + """Sets the current state of the FQE Model. + Args: + state: A state dict returned by `get_state()`. + """ + self.n_iters = state["n_iters"] + self.lr = state["lr"] + self.min_loss_threshold = state["min_loss_threshold"] + self.clip_grad_norm = state["clip_grad_norm"] + self.minibatch_size = state["minibatch_size"] + self.polyak_coef = state["polyak_coef"] + self.gamma = state["gamma"] + self.policy.set_state(state["policy_state"]) + self.q_model.load_state_dict(state["q_model_state"]) + self.target_q_model.load_state_dict(state["target_q_model_state"]) + + @classmethod + def from_state(cls, state: Dict[str, Any]) -> "FQETorchModel": + """Creates a FQE Model from a state dict. + + Args: + state: A state dict returned by `get_state`. + + Returns: + An instance of the FQETorchModel. 
+ """ + policy = Policy.from_state(state["policy_state"]) + model = cls( + policy=policy, gamma=state["gamma"], model_config=state["model_config"] + ) + model.set_state(state) + return model diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/importance_sampling.py b/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/importance_sampling.py new file mode 100644 index 0000000000000000000000000000000000000000..630859820948b7f1139ab27649f21096ce2f28e9 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/importance_sampling.py @@ -0,0 +1,126 @@ +from typing import Dict, List, Any +import math + +from ray.data import Dataset + +from ray.rllib.utils.annotations import override, DeveloperAPI +from ray.rllib.offline.offline_evaluator import OfflineEvaluator +from ray.rllib.offline.offline_evaluation_utils import ( + remove_time_dim, + compute_is_weights, +) +from ray.rllib.offline.estimators.off_policy_estimator import OffPolicyEstimator +from ray.rllib.policy.sample_batch import SampleBatch + + +@DeveloperAPI +class ImportanceSampling(OffPolicyEstimator): + r"""The step-wise IS estimator. + + Let s_t, a_t, and r_t be the state, action, and reward at timestep t. + + For behavior policy \pi_b and evaluation policy \pi_e, define the + cumulative importance ratio at timestep t as: + p_t = \sum_{t'=0}^t (\pi_e(a_{t'} | s_{t'}) / \pi_b(a_{t'} | s_{t'})). + + This estimator computes the expected return for \pi_e for an episode as: + V^{\pi_e}(s_0) = \sum_t \gamma ^ {t} * p_t * r_t + and returns the mean and standard deviation over episodes. 
+ + For more information refer to https://arxiv.org/pdf/1911.06854.pdf""" + + @override(OffPolicyEstimator) + def estimate_on_single_episode(self, episode: SampleBatch) -> Dict[str, float]: + estimates_per_epsiode = {} + + rewards, old_prob = episode["rewards"], episode["action_prob"] + new_prob = self.compute_action_probs(episode) + + # calculate importance ratios + p = [] + for t in range(episode.count): + if t == 0: + pt_prev = 1.0 + else: + pt_prev = p[t - 1] + p.append(pt_prev * new_prob[t] / old_prob[t]) + + # calculate stepwise IS estimate + v_behavior = 0.0 + v_target = 0.0 + for t in range(episode.count): + v_behavior += rewards[t] * self.gamma**t + v_target += p[t] * rewards[t] * self.gamma**t + + estimates_per_epsiode["v_behavior"] = v_behavior + estimates_per_epsiode["v_target"] = v_target + + return estimates_per_epsiode + + @override(OffPolicyEstimator) + def estimate_on_single_step_samples( + self, batch: SampleBatch + ) -> Dict[str, List[float]]: + estimates_per_epsiode = {} + + rewards, old_prob = batch["rewards"], batch["action_prob"] + new_prob = self.compute_action_probs(batch) + + weights = new_prob / old_prob + v_behavior = rewards + v_target = weights * rewards + + estimates_per_epsiode["v_behavior"] = v_behavior + estimates_per_epsiode["v_target"] = v_target + + return estimates_per_epsiode + + @override(OfflineEvaluator) + def estimate_on_dataset( + self, dataset: Dataset, *, n_parallelism: int = ... + ) -> Dict[str, Any]: + """Computes the Importance sampling estimate on the given dataset. + + Note: This estimate works for both continuous and discrete action spaces. + + Args: + dataset: Dataset to compute the estimate on. Each record in dataset should + include the following columns: `obs`, `actions`, `action_prob` and + `rewards`. The `obs` on each row shoud be a vector of D dimensions. + n_parallelism: The number of parallel workers to use. 
+ + Returns: + A dictionary containing the following keys: + v_target: The estimated value of the target policy. + v_behavior: The estimated value of the behavior policy. + v_gain_mean: The mean of the gain of the target policy over the + behavior policy. + v_gain_ste: The standard error of the gain of the target policy over + the behavior policy. + """ + batch_size = max(dataset.count() // n_parallelism, 1) + dataset = dataset.map_batches( + remove_time_dim, batch_size=batch_size, batch_format="pandas" + ) + updated_ds = dataset.map_batches( + compute_is_weights, + batch_size=batch_size, + batch_format="pandas", + fn_kwargs={ + "policy_state": self.policy.get_state(), + "estimator_class": self.__class__, + }, + ) + v_target = updated_ds.mean("weighted_rewards") + v_behavior = updated_ds.mean("rewards") + v_gain_mean = v_target / v_behavior + v_gain_ste = ( + updated_ds.std("weighted_rewards") / v_behavior / math.sqrt(dataset.count()) + ) + + return { + "v_target": v_target, + "v_behavior": v_behavior, + "v_gain_mean": v_gain_mean, + "v_gain_ste": v_gain_ste, + } diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/off_policy_estimator.py b/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/off_policy_estimator.py new file mode 100644 index 0000000000000000000000000000000000000000..0de4f246130ecee290b94e0c4ea5ea5ae6a6d59c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/off_policy_estimator.py @@ -0,0 +1,248 @@ +import gymnasium as gym +import numpy as np +import tree +from typing import Dict, Any, List + +import logging +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.policy import Policy +from ray.rllib.policy.sample_batch import convert_ma_batch_to_sample_batch +from ray.rllib.utils.policy import compute_log_likelihoods_from_input_dict +from ray.rllib.utils.annotations import ( + DeveloperAPI, + ExperimentalAPI, + OverrideToImplementCustomLogic, +) +from 
ray.rllib.utils.deprecation import Deprecated +from ray.rllib.utils.numpy import convert_to_numpy +from ray.rllib.utils.typing import TensorType, SampleBatchType +from ray.rllib.offline.offline_evaluator import OfflineEvaluator + +logger = logging.getLogger(__name__) + + +@DeveloperAPI +class OffPolicyEstimator(OfflineEvaluator): + """Interface for an off policy estimator for counterfactual evaluation.""" + + @DeveloperAPI + def __init__( + self, + policy: Policy, + gamma: float = 0.0, + epsilon_greedy: float = 0.0, + ): + """Initializes an OffPolicyEstimator instance. + + Args: + policy: Policy to evaluate. + gamma: Discount factor of the environment. + epsilon_greedy: The probability by which we act acording to a fully random + policy during deployment. With 1-epsilon_greedy we act according the target + policy. + # TODO (kourosh): convert the input parameters to a config dict. + """ + super().__init__(policy) + self.gamma = gamma + self.epsilon_greedy = epsilon_greedy + + @DeveloperAPI + def estimate_on_single_episode(self, episode: SampleBatch) -> Dict[str, Any]: + """Returns off-policy estimates for the given one episode. + + Args: + batch: The episode to calculate the off-policy estimates (OPE) on. The + episode must be a sample batch type that contains the fields "obs", + "actions", and "action_prob" and it needs to represent a + complete trajectory. + + Returns: + The off-policy estimates (OPE) calculated on the given episode. The returned + dict can be any arbitrary mapping of strings to metrics. + """ + raise NotImplementedError + + @DeveloperAPI + def estimate_on_single_step_samples( + self, + batch: SampleBatch, + ) -> Dict[str, List[float]]: + """Returns off-policy estimates for the batch of single timesteps. This is + highly optimized for bandits assuming each episode is a single timestep. + + Args: + batch: The batch to calculate the off-policy estimates (OPE) on. 
The + batch must be a sample batch type that contains the fields "obs", + "actions", and "action_prob". + + Returns: + The off-policy estimates (OPE) calculated on the given batch of single time + step samples. The returned dict can be any arbitrary mapping of strings to + a list of floats capturing the values per each record. + """ + raise NotImplementedError + + def on_before_split_batch_by_episode( + self, sample_batch: SampleBatch + ) -> SampleBatch: + """Called before the batch is split by episode. You can perform any + preprocessing on the batch that you want here. + e.g. adding done flags to the batch, or reseting some stats that you want to + track per episode later during estimation, .etc. + + Args: + sample_batch: The batch to split by episode. This contains multiple + episodes. + + Returns: + The modified batch before calling split_by_episode(). + """ + return sample_batch + + @OverrideToImplementCustomLogic + def on_after_split_batch_by_episode( + self, all_episodes: List[SampleBatch] + ) -> List[SampleBatch]: + """Called after the batch is split by episode. You can perform any + postprocessing on each episode that you want here. + e.g. computing advantage per episode, .etc. + + Args: + all_episodes: The list of episodes in the original batch. Each element is a + sample batch type that is a single episode. + """ + + return all_episodes + + @OverrideToImplementCustomLogic + def peek_on_single_episode(self, episode: SampleBatch) -> None: + """This is called on each episode before it is passed to + estimate_on_single_episode(). Using this method, you can get a peek at the + entire validation dataset before runnining the estimation. For examlpe if you + need to perform any normalizations of any sorts on the dataset, you can compute + the normalization parameters here. + + Args: + episode: The episode that is split from the original batch. This is a + sample batch type that is a single episode. 
+ """ + pass + + @DeveloperAPI + def estimate( + self, batch: SampleBatchType, split_batch_by_episode: bool = True + ) -> Dict[str, Any]: + """Compute off-policy estimates. + + Args: + batch: The batch to calculate the off-policy estimates (OPE) on. The + batch must contain the fields "obs", "actions", and "action_prob". + split_batch_by_episode: Whether to split the batch by episode. + + Returns: + The off-policy estimates (OPE) calculated on the given batch. The returned + dict can be any arbitrary mapping of strings to metrics. + The dict consists of the following metrics: + - v_behavior: The discounted return averaged over episodes in the batch + - v_behavior_std: The standard deviation corresponding to v_behavior + - v_target: The estimated discounted return for `self.policy`, + averaged over episodes in the batch + - v_target_std: The standard deviation corresponding to v_target + - v_gain: v_target / max(v_behavior, 1e-8) + - v_delta: The difference between v_target and v_behavior. + """ + batch = convert_ma_batch_to_sample_batch(batch) + self.check_action_prob_in_batch(batch) + estimates_per_epsiode = [] + if split_batch_by_episode: + batch = self.on_before_split_batch_by_episode(batch) + all_episodes = batch.split_by_episode() + all_episodes = self.on_after_split_batch_by_episode(all_episodes) + for episode in all_episodes: + assert len(set(episode[SampleBatch.EPS_ID])) == 1, ( + "The episode must contain only one episode id. For some reason " + "the split_by_episode() method could not successfully split " + "the batch by episodes. Each row in the dataset should be " + "one episode. Check your evaluation dataset for errors." 
+ ) + self.peek_on_single_episode(episode) + + for episode in all_episodes: + estimate_step_results = self.estimate_on_single_episode(episode) + estimates_per_epsiode.append(estimate_step_results) + + # turn a list of identical dicts into a dict of lists + estimates_per_epsiode = tree.map_structure( + lambda *x: list(x), *estimates_per_epsiode + ) + else: + # the returned dict is a mapping of strings to a list of floats + estimates_per_epsiode = self.estimate_on_single_step_samples(batch) + + estimates = { + "v_behavior": np.mean(estimates_per_epsiode["v_behavior"]), + "v_behavior_std": np.std(estimates_per_epsiode["v_behavior"]), + "v_target": np.mean(estimates_per_epsiode["v_target"]), + "v_target_std": np.std(estimates_per_epsiode["v_target"]), + } + estimates["v_gain"] = estimates["v_target"] / max(estimates["v_behavior"], 1e-8) + estimates["v_delta"] = estimates["v_target"] - estimates["v_behavior"] + + return estimates + + @DeveloperAPI + def check_action_prob_in_batch(self, batch: SampleBatchType) -> None: + """Checks if we support off policy estimation (OPE) on given batch. + + Args: + batch: The batch to check. + + Raises: + ValueError: In case `action_prob` key is not in batch + """ + + if "action_prob" not in batch: + raise ValueError( + "Off-policy estimation is not possible unless the inputs " + "include action probabilities (i.e., the policy is stochastic " + "and emits the 'action_prob' key). For DQN this means using " + "`exploration_config: {type: 'SoftQ'}`. You can also set " + "`off_policy_estimation_methods: {}` to disable estimation." 
+ ) + + @ExperimentalAPI + def compute_action_probs(self, batch: SampleBatch): + log_likelihoods = compute_log_likelihoods_from_input_dict(self.policy, batch) + new_prob = np.exp(convert_to_numpy(log_likelihoods)) + + if self.epsilon_greedy > 0.0: + if not isinstance(self.policy.action_space, gym.spaces.Discrete): + raise ValueError( + "Evaluation with epsilon-greedy exploration is only supported " + "with discrete action spaces." + ) + eps = self.epsilon_greedy + new_prob = new_prob * (1 - eps) + eps / self.policy.action_space.n + + return new_prob + + @DeveloperAPI + def train(self, batch: SampleBatchType) -> Dict[str, Any]: + """Train a model for Off-Policy Estimation. + + Args: + batch: SampleBatch to train on + + Returns: + Any optional metrics to return from the estimator + """ + return {} + + @Deprecated( + old="OffPolicyEstimator.action_log_likelihood", + new="ray.rllib.utils.policy.compute_log_likelihoods_from_input_dict", + error=True, + ) + def action_log_likelihood(self, batch: SampleBatchType) -> TensorType: + log_likelihoods = compute_log_likelihoods_from_input_dict(self.policy, batch) + return convert_to_numpy(log_likelihoods) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..01f8404da2f07ad128cfdb1f7efff6fcebc63e7b --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/utils/__init__.py @@ -0,0 +1,141 @@ +import contextlib +from functools import partial + +from ray.rllib.utils.annotations import override, PublicAPI, DeveloperAPI +from ray.rllib.utils.deprecation import deprecation_warning +from ray.rllib.utils.filter import Filter +from ray.rllib.utils.filter_manager import FilterManager +from ray.rllib.utils.framework import ( + try_import_jax, + try_import_tf, + try_import_tfp, + try_import_torch, +) +from ray.rllib.utils.numpy import ( + sigmoid, + softmax, + relu, + one_hot, + 
fc, + lstm, + SMALL_NUMBER, + LARGE_INTEGER, + MIN_LOG_NN_OUTPUT, + MAX_LOG_NN_OUTPUT, +) +from ray.rllib.utils.schedules import ( + LinearSchedule, + PiecewiseSchedule, + PolynomialSchedule, + ExponentialSchedule, + ConstantSchedule, +) +from ray.rllib.utils.test_utils import ( + check, + check_compute_single_action, + check_train_results, +) +from ray.tune.utils import merge_dicts, deep_update + + +@DeveloperAPI +def add_mixins(base, mixins, reversed=False): + """Returns a new class with mixins applied in priority order.""" + + mixins = list(mixins or []) + + while mixins: + if reversed: + + class new_base(base, mixins.pop()): + pass + + else: + + class new_base(mixins.pop(), base): + pass + + base = new_base + + return base + + +@DeveloperAPI +def force_list(elements=None, to_tuple=False): + """ + Makes sure `elements` is returned as a list, whether `elements` is a single + item, already a list, or a tuple. + + Args: + elements (Optional[any]): The inputs as single item, list, or tuple to + be converted into a list/tuple. If None, returns empty list/tuple. + to_tuple: Whether to use tuple (instead of list). + + Returns: + Union[list,tuple]: All given elements in a list/tuple depending on + `to_tuple`'s value. If elements is None, + returns an empty list/tuple. 
+ """ + ctor = list + if to_tuple is True: + ctor = tuple + return ( + ctor() + if elements is None + else ctor(elements) + if type(elements) in [list, set, tuple] + else ctor([elements]) + ) + + +@DeveloperAPI +class NullContextManager(contextlib.AbstractContextManager): + """No-op context manager""" + + def __init__(self): + pass + + def __enter__(self): + pass + + def __exit__(self, *args): + pass + + +force_tuple = partial(force_list, to_tuple=True) + +__all__ = [ + "add_mixins", + "check", + "check_compute_single_action", + "check_train_results", + "deep_update", + "deprecation_warning", + "fc", + "force_list", + "force_tuple", + "lstm", + "merge_dicts", + "one_hot", + "override", + "relu", + "sigmoid", + "softmax", + "try_import_jax", + "try_import_tf", + "try_import_tfp", + "try_import_torch", + "ConstantSchedule", + "DeveloperAPI", + "ExponentialSchedule", + "Filter", + "FilterManager", + "LARGE_INTEGER", + "LinearSchedule", + "MAX_LOG_NN_OUTPUT", + "MIN_LOG_NN_OUTPUT", + "PiecewiseSchedule", + "PolynomialSchedule", + "PublicAPI", + "SMALL_NUMBER", +] diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/checkpoints.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/checkpoints.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..16a6113923e5e7485700539b5855c8e137d4fc40 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/checkpoints.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/compression.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/compression.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cf3824e39c119e0332a3c8a45cbe069cc9b35ed3 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/compression.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/deprecation.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/deprecation.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f4b8c6bb6d034408d63102dfcbfecdc710c4341c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/deprecation.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/from_config.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/from_config.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..54b78524e3a32e596d2d2fa82219a28c611c8e83 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/from_config.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/lambda_defaultdict.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/lambda_defaultdict.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..98c7e899f1cb2f49c63c98c467148021f7873f5f Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/lambda_defaultdict.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/memory.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/memory.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c2274fbc35b85a2b14a85579ff1bcd2fc9460432 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/memory.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/serialization.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/serialization.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..0295a6f3b052269b3eb51ab7168996447a1b8871 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/serialization.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/torch_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/torch_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b7dcee799cb3463e244dbe77135932393a476342 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/torch_utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/actors.py b/.venv/lib/python3.11/site-packages/ray/rllib/utils/actors.py new file mode 100644 index 0000000000000000000000000000000000000000..d56dcdbd773f920e6b922eeeceb71e4eb663d68c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/utils/actors.py @@ -0,0 +1,258 @@ +from collections import defaultdict, deque +import logging +import platform +from typing import Any, Dict, List, Optional, Sequence, Tuple, Type + +import ray +from ray.actor import ActorClass, ActorHandle + +logger = logging.getLogger(__name__) + + +class TaskPool: + """Helper class for tracking the status of many in-flight actor tasks.""" + + def __init__(self): + self._tasks = {} + self._objects = {} + self._fetching = deque() + + def add(self, worker, all_obj_refs): + if isinstance(all_obj_refs, list): + obj_ref = all_obj_refs[0] + else: + obj_ref = all_obj_refs + self._tasks[obj_ref] = worker + self._objects[obj_ref] = all_obj_refs + + def completed(self, blocking_wait=False): + pending = list(self._tasks) + if pending: + ready, _ = ray.wait(pending, num_returns=len(pending), timeout=0) + if not ready and blocking_wait: + ready, _ = ray.wait(pending, num_returns=1, timeout=10.0) + for obj_ref in ready: + yield (self._tasks.pop(obj_ref), self._objects.pop(obj_ref)) + + def 
completed_prefetch(self, blocking_wait=False, max_yield=999): + """Similar to completed but only returns once the object is local. + + Assumes obj_ref only is one id.""" + + for worker, obj_ref in self.completed(blocking_wait=blocking_wait): + self._fetching.append((worker, obj_ref)) + + for _ in range(max_yield): + if not self._fetching: + break + + yield self._fetching.popleft() + + def reset_workers(self, workers): + """Notify that some workers may be removed.""" + for obj_ref, ev in self._tasks.copy().items(): + if ev not in workers: + del self._tasks[obj_ref] + del self._objects[obj_ref] + + # We want to keep the same deque reference so that we don't suffer from + # stale references in generators that are still in flight + for _ in range(len(self._fetching)): + ev, obj_ref = self._fetching.popleft() + if ev in workers: + # Re-queue items that are still valid + self._fetching.append((ev, obj_ref)) + + @property + def count(self): + return len(self._tasks) + + +def create_colocated_actors( + actor_specs: Sequence[Tuple[Type, Any, Any, int]], + node: Optional[str] = "localhost", + max_attempts: int = 10, +) -> Dict[Type, List[ActorHandle]]: + """Create co-located actors of any type(s) on any node. + + Args: + actor_specs: Tuple/list with tuples consisting of: 1) The + (already @ray.remote) class(es) to construct, 2) c'tor args, + 3) c'tor kwargs, and 4) the number of actors of that class with + given args/kwargs to construct. + node: The node to co-locate the actors on. By default ("localhost"), + place the actors on the node the caller of this function is + located on. Use None for indicating that any (resource fulfilling) + node in the cluster may be used. + max_attempts: The maximum number of co-location attempts to + perform before throwing an error. + + Returns: + A dict mapping the created types to the list of n ActorHandles + created (and co-located) for that type. 
+ """ + if node == "localhost": + node = platform.node() + + # Maps each entry in `actor_specs` to lists of already co-located actors. + ok = [[] for _ in range(len(actor_specs))] + + # Try n times to co-locate all given actor types (`actor_specs`). + # With each (failed) attempt, increase the number of actors we try to + # create (on the same node), then kill the ones that have been created in + # excess. + for attempt in range(max_attempts): + # If any attempt to co-locate fails, set this to False and we'll do + # another attempt. + all_good = True + # Process all `actor_specs` in sequence. + for i, (typ, args, kwargs, count) in enumerate(actor_specs): + args = args or [] # Allow None. + kwargs = kwargs or {} # Allow None. + # We don't have enough actors yet of this spec co-located on + # the desired node. + if len(ok[i]) < count: + co_located = try_create_colocated( + cls=typ, + args=args, + kwargs=kwargs, + count=count * (attempt + 1), + node=node, + ) + # If node did not matter (None), from here on, use the host + # that the first actor(s) are already co-located on. + if node is None: + node = ray.get(co_located[0].get_host.remote()) + # Add the newly co-located actors to the `ok` list. + ok[i].extend(co_located) + # If we still don't have enough -> We'll have to do another + # attempt. + if len(ok[i]) < count: + all_good = False + # We created too many actors for this spec -> Kill/truncate + # the excess ones. + if len(ok[i]) > count: + for a in ok[i][count:]: + a.__ray_terminate__.remote() + ok[i] = ok[i][:count] + + # All `actor_specs` have been fulfilled, return lists of + # co-located actors. + if all_good: + return ok + + raise Exception("Unable to create enough colocated actors -> aborting.") + + +def try_create_colocated( + cls: Type[ActorClass], + args: List[Any], + count: int, + kwargs: Optional[List[Any]] = None, + node: Optional[str] = "localhost", +) -> List[ActorHandle]: + """Tries to co-locate (same node) a set of Actors of the same type. 
+ + Returns a list of successfully co-located actors. All actors that could + not be co-located (with the others on the given node) will not be in this + list. + + Creates each actor via it's remote() constructor and then checks, whether + it has been co-located (on the same node) with the other (already created) + ones. If not, terminates the just created actor. + + Args: + cls: The Actor class to use (already @ray.remote "converted"). + args: List of args to pass to the Actor's constructor. One item + per to-be-created actor (`count`). + count: Number of actors of the given `cls` to construct. + kwargs: Optional list of kwargs to pass to the Actor's constructor. + One item per to-be-created actor (`count`). + node: The node to co-locate the actors on. By default ("localhost"), + place the actors on the node the caller of this function is + located on. If None, will try to co-locate all actors on + any available node. + + Returns: + List containing all successfully co-located actor handles. + """ + if node == "localhost": + node = platform.node() + + kwargs = kwargs or {} + actors = [cls.remote(*args, **kwargs) for _ in range(count)] + co_located, non_co_located = split_colocated(actors, node=node) + logger.info("Got {} colocated actors of {}".format(len(co_located), count)) + for a in non_co_located: + a.__ray_terminate__.remote() + return co_located + + +def split_colocated( + actors: List[ActorHandle], + node: Optional[str] = "localhost", +) -> Tuple[List[ActorHandle], List[ActorHandle]]: + """Splits up given actors into colocated (on same node) and non colocated. + + The co-location criterion depends on the `node` given: + If given (or default: platform.node()): Consider all actors that are on + that node "colocated". + If None: Consider the largest sub-set of actors that are all located on + the same node (whatever that node is) as "colocated". + + Args: + actors: The list of actor handles to split into "colocated" and + "non colocated". 
+ node: The node defining "colocation" criterion. If provided, consider + thos actors "colocated" that sit on this node. If None, use the + largest subset within `actors` that are sitting on the same + (any) node. + + Returns: + Tuple of two lists: 1) Co-located ActorHandles, 2) non co-located + ActorHandles. + """ + if node == "localhost": + node = platform.node() + + # Get nodes of all created actors. + hosts = ray.get([a.get_host.remote() for a in actors]) + + # If `node` not provided, use the largest group of actors that sit on the + # same node, regardless of what that node is. + if node is None: + node_groups = defaultdict(set) + for host, actor in zip(hosts, actors): + node_groups[host].add(actor) + max_ = -1 + largest_group = None + for host in node_groups: + if max_ < len(node_groups[host]): + max_ = len(node_groups[host]) + largest_group = host + non_co_located = [] + for host in node_groups: + if host != largest_group: + non_co_located.extend(list(node_groups[host])) + return list(node_groups[largest_group]), non_co_located + # Node provided (or default: localhost): Consider those actors "colocated" + # that were placed on `node`. + else: + # Split into co-located (on `node) and non-co-located (not on `node`). + co_located = [] + non_co_located = [] + for host, a in zip(hosts, actors): + # This actor has been placed on the correct node. + if host == node: + co_located.append(a) + # This actor has been placed on a different node. 
+ else: + non_co_located.append(a) + return co_located, non_co_located + + +def drop_colocated(actors: List[ActorHandle]) -> List[ActorHandle]: + colocated, non_colocated = split_colocated(actors) + for a in colocated: + a.__ray_terminate__.remote() + return non_colocated diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/annotations.py b/.venv/lib/python3.11/site-packages/ray/rllib/utils/annotations.py new file mode 100644 index 0000000000000000000000000000000000000000..6824412b354f1f18df9d7a663e99471835680994 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/utils/annotations.py @@ -0,0 +1,213 @@ +from ray.rllib.utils.deprecation import Deprecated +from ray.util.annotations import _mark_annotated + + +def override(parent_cls): + """Decorator for documenting method overrides. + + Args: + parent_cls: The superclass that provides the overridden method. If + `parent_class` does not actually have the method or the class, in which + method is defined is not a subclass of `parent_class`, an error is raised. + + .. testcode:: + :skipif: True + + from ray.rllib.policy import Policy + class TorchPolicy(Policy): + ... + # Indicates that `TorchPolicy.loss()` overrides the parent + # Policy class' own `loss method. Leads to an error if Policy + # does not have a `loss` method. + + @override(Policy) + def loss(self, model, action_dist, train_batch): + ... + + """ + + class OverrideCheck: + def __init__(self, func, expected_parent_cls): + self.func = func + self.expected_parent_cls = expected_parent_cls + + def __set_name__(self, owner, name): + # Check if the owner (the class) is a subclass of the expected base class + if not issubclass(owner, self.expected_parent_cls): + raise TypeError( + f"When using the @override decorator, {owner.__name__} must be a " + f"subclass of {parent_cls.__name__}!" + ) + # Set the function as a regular method on the class. 
+ setattr(owner, name, self.func) + + def decorator(method): + # Check, whether `method` is actually defined by the parent class. + if method.__name__ not in dir(parent_cls): + raise NameError( + f"When using the @override decorator, {method.__name__} must override " + f"the respective method (with the same name) of {parent_cls.__name__}!" + ) + + # Check if the class is a subclass of the expected base class + OverrideCheck(method, parent_cls) + return method + + return decorator + + +def PublicAPI(obj): + """Decorator for documenting public APIs. + + Public APIs are classes and methods exposed to end users of RLlib. You + can expect these APIs to remain stable across RLlib releases. + + Subclasses that inherit from a ``@PublicAPI`` base class can be + assumed part of the RLlib public API as well (e.g., all Algorithm classes + are in public API because Algorithm is ``@PublicAPI``). + + In addition, you can assume all algo configurations are part of their + public API as well. + + .. testcode:: + :skipif: True + + # Indicates that the `Algorithm` class is exposed to end users + # of RLlib and will remain stable across RLlib releases. + from ray import tune + @PublicAPI + class Algorithm(tune.Trainable): + ... + """ + + _mark_annotated(obj) + return obj + + +def DeveloperAPI(obj): + """Decorator for documenting developer APIs. + + Developer APIs are classes and methods explicitly exposed to developers + for the purposes of building custom algorithms or advanced training + strategies on top of RLlib internals. You can generally expect these APIs + to be stable sans minor changes (but less stable than public APIs). + + Subclasses that inherit from a ``@DeveloperAPI`` base class can be + assumed part of the RLlib developer API as well. + + .. testcode:: + :skipif: True + + # Indicates that the `TorchPolicy` class is exposed to end users + # of RLlib and will remain (relatively) stable across RLlib + # releases. 
+ from ray.rllib.policy import Policy + @DeveloperAPI + class TorchPolicy(Policy): + ... + """ + + _mark_annotated(obj) + return obj + + +def ExperimentalAPI(obj): + """Decorator for documenting experimental APIs. + + Experimental APIs are classes and methods that are in development and may + change at any time in their development process. You should not expect + these APIs to be stable until their tag is changed to `DeveloperAPI` or + `PublicAPI`. + + Subclasses that inherit from a ``@ExperimentalAPI`` base class can be + assumed experimental as well. + + .. testcode:: + :skipif: True + + from ray.rllib.policy import Policy + class TorchPolicy(Policy): + ... + # Indicates that the `TorchPolicy.loss` method is a new and + # experimental API and may change frequently in future + # releases. + @ExperimentalAPI + def loss(self, model, action_dist, train_batch): + ... + """ + + _mark_annotated(obj) + return obj + + +def OldAPIStack(obj): + """Decorator for classes/methods/functions belonging to the old API stack. + + These should be deprecated at some point after Ray 3.0 (RLlib GA). + It is recommended for users to start exploring (and coding against) the new API + stack instead. + """ + # No effect yet. + + _mark_annotated(obj) + return obj + + +def OverrideToImplementCustomLogic(obj): + """Users should override this in their sub-classes to implement custom logic. + + Used in Algorithm and Policy to tag methods that need overriding, e.g. + `Policy.loss()`. + + .. testcode:: + :skipif: True + + from ray.rllib.policy.torch_policy import TorchPolicy + @overrides(TorchPolicy) + @OverrideToImplementCustomLogic + def loss(self, ...): + # implement custom loss function here ... + # ... w/o calling the corresponding `super().loss()` method. + ... + + """ + obj.__is_overridden__ = False + return obj + + +def OverrideToImplementCustomLogic_CallToSuperRecommended(obj): + """Users should override this in their sub-classes to implement custom logic. 
+ + Thereby, it is recommended (but not required) to call the super-class' + corresponding method. + + Used in Algorithm and Policy to tag methods that need overriding, but the + super class' method should still be called, e.g. + `Algorithm.setup()`. + + .. testcode:: + :skipif: True + + from ray import tune + @overrides(tune.Trainable) + @OverrideToImplementCustomLogic_CallToSuperRecommended + def setup(self, config): + # implement custom setup logic here ... + super().setup(config) + # ... or here (after having called super()'s setup method. + """ + obj.__is_overridden__ = False + return obj + + +def is_overridden(obj): + """Check whether a function has been overridden. + + Note, this only works for API calls decorated with OverrideToImplementCustomLogic + or OverrideToImplementCustomLogic_CallToSuperRecommended. + """ + return getattr(obj, "__is_overridden__", True) + + +# Backward compatibility. +Deprecated = Deprecated diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/checkpoints.py b/.venv/lib/python3.11/site-packages/ray/rllib/utils/checkpoints.py new file mode 100644 index 0000000000000000000000000000000000000000..1c8e9531fc34afbc57675f6d6f61bd137f9a5436 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/utils/checkpoints.py @@ -0,0 +1,1045 @@ +import abc +import inspect +import json +import logging +import os +from packaging import version +import pathlib +import re +import tempfile +from types import MappingProxyType +from typing import Any, Collection, Dict, List, Optional, Tuple, Union + +import pyarrow.fs + +import ray +import ray.cloudpickle as pickle +from ray.rllib.core import ( + COMPONENT_LEARNER, + COMPONENT_LEARNER_GROUP, + COMPONENT_RL_MODULE, +) +from ray.rllib.utils import force_list +from ray.rllib.utils.actor_manager import FaultTolerantActorManager +from ray.rllib.utils.annotations import ( + OldAPIStack, + OverrideToImplementCustomLogic_CallToSuperRecommended, +) +from ray.rllib.utils.serialization import 
NOT_SERIALIZABLE, serialize_type +from ray.rllib.utils.typing import StateDict +from ray.train import Checkpoint +from ray.tune.utils.file_transfer import sync_dir_between_nodes +from ray.util import log_once +from ray.util.annotations import PublicAPI + +logger = logging.getLogger(__name__) + +# The current checkpoint version used by RLlib for Algorithm and Policy checkpoints. +# History: +# 0.1: Ray 2.0.0 +# A single `checkpoint-[iter num]` file for Algorithm checkpoints +# within the checkpoint directory. Policy checkpoints not supported across all +# DL frameworks. + +# 1.0: Ray >=2.1.0 +# An algorithm_state.pkl file for the state of the Algorithm (excluding +# individual policy states). +# One sub-dir inside the "policies" sub-dir for each policy with a +# dedicated policy_state.pkl in it for the policy state. + +# 1.1: Same as 1.0, but has a new "format" field in the rllib_checkpoint.json file +# indicating, whether the checkpoint is `cloudpickle` (default) or `msgpack`. + +# 1.2: Introduces the checkpoint for the new Learner API if the Learner API is enabled. + +# 2.0: Introduces the Checkpointable API for all components on the new API stack +# (if the Learner-, RLModule, EnvRunner, and ConnectorV2 APIs are enabled). + +CHECKPOINT_VERSION = version.Version("1.1") +CHECKPOINT_VERSION_LEARNER_AND_ENV_RUNNER = version.Version("2.1") + + +@PublicAPI(stability="alpha") +class Checkpointable(abc.ABC): + """Abstract base class for a component of RLlib that can be checkpointed to disk. + + Subclasses must implement the following APIs: + - save_to_path() + - restore_from_path() + - from_checkpoint() + - get_state() + - set_state() + - get_ctor_args_and_kwargs() + - get_metadata() + - get_checkpointable_components() + """ + + # The state file for the implementing class. 
+ # This file contains any state information that does NOT belong to any subcomponent + # of the implementing class (which are `Checkpointable` themselves and thus should + # have their own state- and metadata files). + # After a `save_to_path([path])` this file can be found directly in: `path/`. + STATE_FILE_NAME = "state" + + # The filename of the pickle file that contains the class information of the + # Checkpointable as well as all constructor args to be passed to such a class in + # order to construct a new instance. + CLASS_AND_CTOR_ARGS_FILE_NAME = "class_and_ctor_args.pkl" + + # Subclasses may set this to their own metadata filename. + # The dict returned by self.get_metadata() is stored in this JSON file. + METADATA_FILE_NAME = "metadata.json" + + def save_to_path( + self, + path: Optional[Union[str, pathlib.Path]] = None, + *, + state: Optional[StateDict] = None, + filesystem: Optional["pyarrow.fs.FileSystem"] = None, + use_msgpack: bool = False, + ) -> str: + """Saves the state of the implementing class (or `state`) to `path`. + + The state of the implementing class is always saved in the following format: + + .. testcode:: + :skipif: True + + path/ + [component1]/ + [component1 subcomponentA]/ + ... + [component1 subcomponentB]/ + ... + [component2]/ + ... + [cls.METADATA_FILE_NAME] (json) + [cls.STATE_FILE_NAME] (pkl|msgpack) + + The main logic is to loop through all subcomponents of this Checkpointable + and call their respective `save_to_path` methods. Then save the remaining + (non subcomponent) state to this Checkpointable's STATE_FILE_NAME. + In the exception that a component is a FaultTolerantActorManager instance, + instead of calling `save_to_path` directly on that manager, the first healthy + actor is interpreted as the component and its `save_to_path` method is called. + Even if that actor is located on another node, the created file is automatically + synced to the local node. 
+ + Args: + path: The path to the directory to save the state of the implementing class + to. If `path` doesn't exist or is None, then a new directory will be + created (and returned). + state: An optional state dict to be used instead of getting a new state of + the implementing class through `self.get_state()`. + filesystem: PyArrow FileSystem to use to access data at the `path`. + If not specified, this is inferred from the URI scheme of `path`. + use_msgpack: Whether the state file should be written using msgpack and + msgpack_numpy (file extension is `.msgpack`), rather than pickle (file + extension is `.pkl`). + + Returns: + The path (str) where the state has been saved. + """ + + # If no path is given create a local temporary directory. + if path is None: + import uuid + + # Get the location of the temporary directory on the OS. + tmp_dir = pathlib.Path(tempfile.gettempdir()) + # Create a random directory name. + random_dir_name = str(uuid.uuid4()) + # Create the path, but do not craet the directory on the + # filesystem, yet. This is done by `PyArrow`. + path = path or tmp_dir / random_dir_name + + # We need a string path for `pyarrow.fs.FileSystem.from_uri`. + path = path if isinstance(path, str) else path.as_posix() + + # If we have no filesystem, figure it out. + if path and not filesystem: + # Note the path needs to be a path that is relative to the + # filesystem (e.g. `gs://tmp/...` -> `tmp/...`). + filesystem, path = pyarrow.fs.FileSystem.from_uri(path) + + # Make sure, path exists. + filesystem.create_dir(path, recursive=True) + + # Convert to `pathlib.Path` for easy handling. + path = pathlib.Path(path) + + # Write metadata file to disk. 
+ metadata = self.get_metadata() + if "checkpoint_version" not in metadata: + metadata["checkpoint_version"] = str( + CHECKPOINT_VERSION_LEARNER_AND_ENV_RUNNER + ) + with filesystem.open_output_stream( + (path / self.METADATA_FILE_NAME).as_posix() + ) as f: + f.write(json.dumps(metadata).encode("utf-8")) + + # Write the class and constructor args information to disk. Always use pickle + # for this, because this information contains classes and maybe other + # non-serializable data. + with filesystem.open_output_stream( + (path / self.CLASS_AND_CTOR_ARGS_FILE_NAME).as_posix() + ) as f: + pickle.dump( + { + "class": type(self), + "ctor_args_and_kwargs": self.get_ctor_args_and_kwargs(), + }, + f, + ) + + # Get the entire state of this Checkpointable, or use provided `state`. + _state_provided = state is not None + state = state or self.get_state( + not_components=[c[0] for c in self.get_checkpointable_components()] + ) + + # Write components of `self` that themselves are `Checkpointable`. + for comp_name, comp in self.get_checkpointable_components(): + # If subcomponent's name is not in `state`, ignore it and don't write this + # subcomponent's state to disk. + if _state_provided and comp_name not in state: + continue + comp_path = path / comp_name + + # If component is an ActorManager, save the manager's first healthy + # actor's state to disk (even if it's on another node, in which case, we'll + # sync the generated file(s) back to this node). + if isinstance(comp, FaultTolerantActorManager): + actor_to_use = comp.healthy_actor_ids()[0] + + def _get_ip(_=None): + import ray + + return ray.util.get_node_ip_address() + + _result = next( + iter( + comp.foreach_actor( + _get_ip, + remote_actor_ids=[actor_to_use], + ) + ) + ) + if not _result.ok: + raise _result.get() + worker_ip_addr = _result.get() + self_ip_addr = _get_ip() + + # Save the state to a temporary location on the `actor_to_use`'s + # node. 
+ comp_state_ref = None + if _state_provided: + comp_state_ref = ray.put(state.pop(comp_name)) + + if worker_ip_addr == self_ip_addr: + comp.foreach_actor( + lambda w, _path=comp_path, _state=comp_state_ref, _use_msgpack=use_msgpack: ( # noqa + w.save_to_path( + _path, + state=( + ray.get(_state) + if _state is not None + else w.get_state() + ), + use_msgpack=_use_msgpack, + ) + ), + remote_actor_ids=[actor_to_use], + ) + else: + # Save the checkpoint to the temporary directory on the worker. + def _save(w, _state=comp_state_ref, _use_msgpack=use_msgpack): + import tempfile + + # Create a temporary directory on the worker. + tmpdir = tempfile.mkdtemp() + w.save_to_path( + tmpdir, + state=( + ray.get(_state) if _state is not None else w.get_state() + ), + use_msgpack=_use_msgpack, + ) + return tmpdir + + _result = next( + iter(comp.foreach_actor(_save, remote_actor_ids=[actor_to_use])) + ) + if not _result.ok: + raise _result.get() + worker_temp_dir = _result.get() + + # Sync the temporary directory from the worker to this node. + sync_dir_between_nodes( + worker_ip_addr, + worker_temp_dir, + self_ip_addr, + str(comp_path), + ) + + # Remove the temporary directory on the worker. + def _rmdir(_, _dir=worker_temp_dir): + import shutil + + shutil.rmtree(_dir) + + comp.foreach_actor(_rmdir, remote_actor_ids=[actor_to_use]) + + # Local component (instance stored in a property of `self`). + else: + if _state_provided: + comp_state = state.pop(comp_name) + else: + comp_state = self.get_state(components=comp_name)[comp_name] + # By providing the `state` arg, we make sure that the component does not + # have to call its own `get_state()` anymore, but uses what's provided + # here. + comp.save_to_path( + comp_path, + filesystem=filesystem, + state=comp_state, + use_msgpack=use_msgpack, + ) + + # Write all the remaining state to disk. 
+ filename = path / ( + self.STATE_FILE_NAME + (".msgpack" if use_msgpack else ".pkl") + ) + with filesystem.open_output_stream(filename.as_posix()) as f: + if use_msgpack: + msgpack = try_import_msgpack(error=True) + msgpack.dump(state, f) + else: + pickle.dump(state, f) + + return str(path) + + def restore_from_path( + self, + path: Union[str, pathlib.Path], + *, + component: Optional[str] = None, + filesystem: Optional["pyarrow.fs.FileSystem"] = None, + **kwargs, + ) -> None: + """Restores the state of the implementing class from the given path. + + If the `component` arg is provided, `path` refers to a checkpoint of a + subcomponent of `self`, thus allowing the user to load only the subcomponent's + state into `self` without affecting any of the other state information (for + example, loading only the NN state into a Checkpointable, which contains such + an NN, but also has other state information that should NOT be changed by + calling this method). + + The given `path` should have the following structure and contain the following + files: + + .. testcode:: + :skipif: True + + path/ + [component1]/ + [component1 subcomponentA]/ + ... + [component1 subcomponentB]/ + ... + [component2]/ + ... + [cls.METADATA_FILE_NAME] (json) + [cls.STATE_FILE_NAME] (pkl|msgpack) + + Note that the self.METADATA_FILE_NAME file is not required to restore the state. + + Args: + path: The path to load the implementing class' state from or to load the + state of only one subcomponent's state of the implementing class (if + `component` is provided). + component: If provided, `path` is interpreted as the checkpoint path of only + the subcomponent and thus, only that subcomponent's state is + restored/loaded. All other state of `self` remains unchanged in this + case. + filesystem: PyArrow FileSystem to use to access data at the `path`. If not + specified, this is inferred from the URI scheme of `path`. + **kwargs: Forward compatibility kwargs. 
+ """ + path = path if isinstance(path, str) else path.as_posix() + + if path and not filesystem: + # Note the path needs to be a path that is relative to the + # filesystem (e.g. `gs://tmp/...` -> `tmp/...`). + filesystem, path = pyarrow.fs.FileSystem.from_uri(path) + # Only here convert to a `Path` instance b/c otherwise + # cloud path gets broken (i.e. 'gs://' -> 'gs:/'). + path = pathlib.Path(path) + + if not _exists_at_fs_path(filesystem, path.as_posix()): + raise FileNotFoundError(f"`path` ({path}) not found!") + + # Restore components of `self` that themselves are `Checkpointable`. + orig_comp_names = {c[0] for c in self.get_checkpointable_components()} + self._restore_all_subcomponents_from_path( + path, filesystem, component=component, **kwargs + ) + + # Restore the "base" state (not individual subcomponents). + if component is None: + filename = path / self.STATE_FILE_NAME + if filename.with_suffix(".msgpack").is_file(): + msgpack = try_import_msgpack(error=True) + with filesystem.open_input_stream( + filename.with_suffix(".msgpack").as_posix() + ) as f: + state = msgpack.load(f, strict_map_key=False) + else: + with filesystem.open_input_stream( + filename.with_suffix(".pkl").as_posix() + ) as f: + state = pickle.load(f) + self.set_state(state) + + new_comp_names = {c[0] for c in self.get_checkpointable_components()} + diff_comp_names = new_comp_names - orig_comp_names + if diff_comp_names: + self._restore_all_subcomponents_from_path( + path, filesystem, only_comp_names=diff_comp_names, **kwargs + ) + + @classmethod + def from_checkpoint( + cls, + path: Union[str, pathlib.Path], + filesystem: Optional["pyarrow.fs.FileSystem"] = None, + **kwargs, + ) -> "Checkpointable": + """Creates a new Checkpointable instance from the given location and returns it. + + Args: + path: The checkpoint path to load (a) the information on how to construct + a new instance of the implementing class and (b) the state to restore + the created instance to. 
+ filesystem: PyArrow FileSystem to use to access data at the `path`. If not + specified, this is inferred from the URI scheme of `path`. + kwargs: Forward compatibility kwargs. Note that these kwargs are sent to + each subcomponent's `from_checkpoint()` call. + + Returns: + A new instance of the implementing class, already set to the state stored + under `path`. + """ + # We need a string path for the `PyArrow` filesystem. + path = path if isinstance(path, str) else path.as_posix() + + # If no filesystem is passed in create one. + if path and not filesystem: + # Note the path needs to be a path that is relative to the + # filesystem (e.g. `gs://tmp/...` -> `tmp/...`). + filesystem, path = pyarrow.fs.FileSystem.from_uri(path) + # Only here convert to a `Path` instance b/c otherwise + # cloud path gets broken (i.e. 'gs://' -> 'gs:/'). + path = pathlib.Path(path) + + # Get the class constructor to call and its args/kwargs. + # Try reading the pickle file first. + try: + with filesystem.open_input_stream( + (path / cls.CLASS_AND_CTOR_ARGS_FILE_NAME).as_posix() + ) as f: + ctor_info = pickle.load(f) + ctor = ctor_info["class"] + ctor_args = force_list(ctor_info["ctor_args_and_kwargs"][0]) + ctor_kwargs = ctor_info["ctor_args_and_kwargs"][1] + + # Inspect the ctor to see, which arguments in ctor_info should be replaced + # with the user provided **kwargs. + for i, (param_name, param) in enumerate( + inspect.signature(ctor).parameters.items() + ): + if param_name in kwargs: + val = kwargs.pop(param_name) + if ( + param.kind == inspect._ParameterKind.POSITIONAL_OR_KEYWORD + and len(ctor_args) > i + ): + ctor_args[i] = val + else: + ctor_kwargs[param_name] = val + + # If the pickle file is from another python version, use provided + # args instead. + except Exception: + # Use class that this method was called on. + ctor = cls + # Use only user provided **kwargs. + ctor_args = [] + ctor_kwargs = kwargs + + # Check, whether the constructor actually goes together with `cls`. 
+ if not issubclass(ctor, cls): + raise ValueError( + f"The class ({ctor}) stored in checkpoint ({path}) does not seem to be " + f"a subclass of `cls` ({cls})!" + ) + elif not issubclass(ctor, Checkpointable): + raise ValueError( + f"The class ({ctor}) stored in checkpoint ({path}) does not seem to be " + "an implementer of the `Checkpointable` API!" + ) + + # Construct the initial object (without any particular state). + obj = ctor(*ctor_args, **ctor_kwargs) + # Restore the state of the constructed object. + obj.restore_from_path(path, filesystem=filesystem, **kwargs) + # Return the new object. + return obj + + @abc.abstractmethod + def get_state( + self, + components: Optional[Union[str, Collection[str]]] = None, + *, + not_components: Optional[Union[str, Collection[str]]] = None, + **kwargs, + ) -> StateDict: + """Returns the implementing class's current state as a dict. + + The returned dict must only contain msgpack-serializable data if you want to + use the `AlgorithmConfig._msgpack_checkpoints` option. Consider returning your + non msgpack-serializable data from the `Checkpointable.get_ctor_args_and_kwargs` + method, instead. + + Args: + components: An optional collection of string keys to be included in the + returned state. This might be useful, if getting certain components + of the state is expensive (e.g. reading/compiling the weights of a large + NN) and at the same time, these components are not required by the + caller. + not_components: An optional list of string keys to be excluded in the + returned state, even if the same string is part of `components`. + This is useful to get the complete state of the class, except + one or a few components. + kwargs: Forward-compatibility kwargs. + + Returns: + The current state of the implementing class (or only the `components` + specified, w/o those in `not_components`). 
+ """ + + @abc.abstractmethod + def set_state(self, state: StateDict) -> None: + """Sets the implementing class' state to the given state dict. + + If component keys are missing in `state`, these components of the implementing + class will not be updated/set. + + Args: + state: The state dict to restore the state from. Maps component keys + to the corresponding subcomponent's own state. + """ + + @abc.abstractmethod + def get_ctor_args_and_kwargs(self) -> Tuple[Tuple, Dict[str, Any]]: + """Returns the args/kwargs used to create `self` from its constructor. + + Returns: + A tuple of the args (as a tuple) and kwargs (as a Dict[str, Any]) used to + construct `self` from its class constructor. + """ + + @OverrideToImplementCustomLogic_CallToSuperRecommended + def get_metadata(self) -> Dict: + """Returns JSON writable metadata further describing the implementing class. + + Note that this metadata is NOT part of any state and is thus NOT needed to + restore the state of a Checkpointable instance from a directory. Rather, the + metadata will be written into `self.METADATA_FILE_NAME` when calling + `self.save_to_path()` for the user's convenience. + + Returns: + A JSON-encodable dict of metadata information. + """ + return { + "class_and_ctor_args_file": self.CLASS_AND_CTOR_ARGS_FILE_NAME, + "state_file": self.STATE_FILE_NAME, + "ray_version": ray.__version__, + "ray_commit": ray.__commit__, + } + + def get_checkpointable_components(self) -> List[Tuple[str, "Checkpointable"]]: + """Returns the implementing class's own Checkpointable subcomponents. + + Returns: + A list of 2-tuples (name, subcomponent) describing the implementing class' + subcomponents, all of which have to be `Checkpointable` themselves and + whose state is therefore written into subdirectories (rather than the main + state file (self.STATE_FILE_NAME) when calling `self.save_to_path()`). 
+ """ + return [] + + def _check_component(self, name, components, not_components) -> bool: + comp_list = force_list(components) + not_comp_list = force_list(not_components) + if ( + components is None + or any(c.startswith(name + "/") for c in comp_list) + or name in comp_list + ) and (not_components is None or name not in not_comp_list): + return True + return False + + def _get_subcomponents(self, name, components): + if components is None: + return None + + components = force_list(components) + subcomponents = [] + for comp in components: + if comp.startswith(name + "/"): + subcomponents.append(comp[len(name) + 1 :]) + + return None if not subcomponents else subcomponents + + def _restore_all_subcomponents_from_path( + self, path, filesystem, only_comp_names=None, component=None, **kwargs + ): + for comp_name, comp in self.get_checkpointable_components(): + if only_comp_names is not None and comp_name not in only_comp_names: + continue + + # The value of the `component` argument for the upcoming + # `[subcomponent].restore_from_path(.., component=..)` call. + comp_arg = None + + if component is None: + comp_dir = path / comp_name + # If subcomponent's dir is not in path, ignore it and don't restore this + # subcomponent's state from disk. + if not _exists_at_fs_path(filesystem, comp_dir.as_posix()): + continue + else: + comp_dir = path + + # `component` is a path that starts with `comp` -> Remove the name of + # `comp` from the `component` arg in the upcoming call to `restore_..`. + if component.startswith(comp_name + "/"): + comp_arg = component[len(comp_name) + 1 :] + # `component` has nothing to do with `comp` -> Skip. + elif component != comp_name: + continue + + # If component is an ActorManager, restore all the manager's healthy + # actors' states from disk (even if they are on another node, in which case, + # we'll sync checkpoint file(s) to the respective node). 
def _exists_at_fs_path(fs: pyarrow.fs.FileSystem, path: str) -> bool:
    """Returns `True` if the path can be found in the filesystem."""
    file_info = fs.get_file_info(path)
    return file_info.type != pyarrow.fs.FileType.NotFound


def _is_dir(file_info: pyarrow.fs.FileInfo) -> bool:
    """Returns `True`, if the file info is from a directory."""
    return file_info.type == pyarrow.fs.FileType.Directory


@OldAPIStack
def get_checkpoint_info(
    checkpoint: Union[str, Checkpoint],
    filesystem: Optional["pyarrow.fs.FileSystem"] = None,
) -> Dict[str, Any]:
    """Returns a dict with information about an Algorithm/Policy checkpoint.

    If the given checkpoint is a >=v1.0 checkpoint directory, try reading all
    information from the contained `rllib_checkpoint.json` file.

    Args:
        checkpoint: The checkpoint directory (str) or an AIR Checkpoint object.
        filesystem: PyArrow FileSystem to use to access data at the `checkpoint`.
            If not specified, this is inferred from the URI scheme provided by
            `checkpoint`.

    Returns:
        A dict containing the keys:
        "type": One of "Policy" or "Algorithm".
        "format": One of "cloudpickle" or "msgpack".
        "checkpoint_version": A version tuple, e.g. v1.0, indicating the checkpoint
        version. This will help RLlib to remain backward compatible wrt. future
        Ray and checkpoint versions.
        "checkpoint_dir": The directory with all the checkpoint files in it. This might
        be the same as the incoming `checkpoint` arg.
        "state_file": The main file with the Algorithm/Policy's state information in it.
        This is usually a pickle-encoded file.
        "policy_ids": An optional set of PolicyIDs in case we are dealing with an
        Algorithm checkpoint. None if `checkpoint` is a Policy checkpoint.

    Raises:
        ValueError: If no valid checkpoint file/dir can be found at `checkpoint`.
    """
    # Default checkpoint info.
    info = {
        "type": "Algorithm",
        "format": "cloudpickle",
        "checkpoint_version": CHECKPOINT_VERSION,
        "checkpoint_dir": None,
        "state_file": None,
        "policy_ids": None,
        "module_ids": None,
    }

    # `checkpoint` is a Checkpoint instance: Translate to directory and continue.
    if isinstance(checkpoint, Checkpoint):
        checkpoint = checkpoint.to_directory()

    # If no filesystem is passed in, infer it from the path's URI scheme.
    if checkpoint and not filesystem:
        # Note the path needs to be a path that is relative to the
        # filesystem (e.g. `gs://tmp/...` -> `tmp/...`).
        filesystem, checkpoint = pyarrow.fs.FileSystem.from_uri(checkpoint)

    # Bug fix: Always convert to a `pathlib.Path` here, not only in the
    # no-filesystem branch above. Previously, passing a custom `filesystem`
    # together with a string `checkpoint` crashed below on
    # `checkpoint.as_posix()`. Only here convert to a `Path` instance b/c
    # otherwise the cloud path gets broken (i.e. 'gs://' -> 'gs:/').
    checkpoint = pathlib.Path(checkpoint)

    # Checkpoint is dir.
    if _exists_at_fs_path(filesystem, checkpoint.as_posix()) and _is_dir(
        filesystem.get_file_info(checkpoint.as_posix())
    ):
        info.update({"checkpoint_dir": str(checkpoint)})

        # Figure out whether this is an older checkpoint format
        # (with a `checkpoint-\d+` file in it).
        file_info_list = filesystem.get_file_info(
            pyarrow.fs.FileSelector(checkpoint.as_posix(), recursive=False)
        )
        for file_info in file_info_list:
            if file_info.is_file and re.match(r"checkpoint-\d+", file_info.base_name):
                info.update(
                    {
                        "checkpoint_version": version.Version("0.1"),
                        "state_file": str(file_info.base_name),
                    }
                )
                return info

        # No old checkpoint file found.

        # If rllib_checkpoint.json file present, read available information from it
        # and then continue with the checkpoint analysis (possibly overriding
        # further information).
        if _exists_at_fs_path(
            filesystem, (checkpoint / "rllib_checkpoint.json").as_posix()
        ):
            with filesystem.open_input_stream(
                (checkpoint / "rllib_checkpoint.json").as_posix()
            ) as f:
                rllib_checkpoint_info = json.load(fp=f)
            if "checkpoint_version" in rllib_checkpoint_info:
                rllib_checkpoint_info["checkpoint_version"] = version.Version(
                    rllib_checkpoint_info["checkpoint_version"]
                )
            info.update(rllib_checkpoint_info)
        else:
            # No rllib_checkpoint.json file present: Warn and continue trying to
            # figure out checkpoint info ourselves.
            if log_once("no_rllib_checkpoint_json_file"):
                logger.warning(
                    "No `rllib_checkpoint.json` file found in checkpoint directory "
                    f"{checkpoint}! Trying to extract checkpoint info from other files "
                    f"found in that dir."
                )

        # Policy checkpoint file found.
        for extension in ["pkl", "msgpck"]:
            if _exists_at_fs_path(
                filesystem, (checkpoint / ("policy_state." + extension)).as_posix()
            ):
                info.update(
                    {
                        "type": "Policy",
                        "format": "cloudpickle" if extension == "pkl" else "msgpack",
                        "checkpoint_version": CHECKPOINT_VERSION,
                        "state_file": str(checkpoint / f"policy_state.{extension}"),
                    }
                )
                return info

        # Valid Algorithm checkpoint >v0 file found?
        format = None
        for extension in ["pkl", "msgpck", "msgpack"]:
            state_file = checkpoint / f"algorithm_state.{extension}"
            if (
                _exists_at_fs_path(filesystem, state_file.as_posix())
                and filesystem.get_file_info(state_file.as_posix()).is_file
            ):
                format = "cloudpickle" if extension == "pkl" else "msgpack"
                break
        if format is None:
            raise ValueError(
                "Given checkpoint does not seem to be valid! No file with the name "
                "`algorithm_state.[pkl|msgpack|msgpck]` (or `checkpoint-[0-9]+`) found."
            )

        info.update(
            {
                "format": format,
                "state_file": str(state_file),
            }
        )

        # Collect all policy IDs in the sub-dir "policies/".
        policies_dir = checkpoint / "policies"
        if _exists_at_fs_path(filesystem, policies_dir.as_posix()) and _is_dir(
            filesystem.get_file_info(policies_dir.as_posix())
        ):
            policy_ids = set()
            file_info_list = filesystem.get_file_info(
                pyarrow.fs.FileSelector(policies_dir.as_posix(), recursive=False)
            )
            for file_info in file_info_list:
                policy_ids.add(file_info.base_name)
            info.update({"policy_ids": policy_ids})

        # Collect all module IDs in the sub-dir "learner/module_state/".
        modules_dir = (
            checkpoint
            / COMPONENT_LEARNER_GROUP
            / COMPONENT_LEARNER
            / COMPONENT_RL_MODULE
        )
        # Bug fix: Check existence of `modules_dir` here (previously, this
        # re-checked `checkpoint`, which is always True at this point, and only
        # the NotFound semantics of `_is_dir` prevented a wrong result).
        if _exists_at_fs_path(filesystem, modules_dir.as_posix()) and _is_dir(
            filesystem.get_file_info(modules_dir.as_posix())
        ):
            module_ids = set()
            file_info_list = filesystem.get_file_info(
                pyarrow.fs.FileSelector(modules_dir.as_posix(), recursive=False)
            )
            for file_info in file_info_list:
                # Only add subdirs (those are the ones where the RLModule data
                # is stored), not files (could be json metadata files).
                module_dir = modules_dir / file_info.base_name
                if _is_dir(filesystem.get_file_info(module_dir.as_posix())):
                    module_ids.add(file_info.base_name)
            info.update({"module_ids": module_ids})

    # Checkpoint is a file: Use as-is (interpreting it as old Algorithm checkpoint
    # version).
    elif (
        _exists_at_fs_path(filesystem, checkpoint.as_posix())
        and filesystem.get_file_info(checkpoint.as_posix()).is_file
    ):
        info.update(
            {
                "checkpoint_version": version.Version("0.1"),
                "checkpoint_dir": str(checkpoint.parent),
                "state_file": str(checkpoint),
            }
        )

    else:
        raise ValueError(
            f"Given checkpoint ({str(checkpoint)}) not found! Must be a "
            "checkpoint directory (or a file for older checkpoint versions)."
        )

    return info
@OldAPIStack
def convert_to_msgpack_policy_checkpoint(
    policy_checkpoint: Union[str, Checkpoint],
    msgpack_checkpoint_dir: str,
) -> str:
    """Converts a Policy checkpoint (pickle based) to a msgpack based one.

    Msgpack has the advantage of being python version independent.

    Args:
        policy_checkpoint: The directory, in which to find the Policy checkpoint
            (pickle based).
        msgpack_checkpoint_dir: The directory, in which to create the new msgpack
            based checkpoint.

    Returns:
        The directory in which the msgpack checkpoint has been created. Note that
        this is the same as `msgpack_checkpoint_dir`.
    """
    from ray.rllib.policy.policy import Policy

    # Restore the Policy from the (pickle-based) checkpoint.
    restored_policy = Policy.from_checkpoint(policy_checkpoint)

    # Re-export its state into the target dir, this time msgpack-encoded.
    os.makedirs(msgpack_checkpoint_dir, exist_ok=True)
    restored_policy.export_checkpoint(
        msgpack_checkpoint_dir,
        policy_state=restored_policy.get_state(),
        checkpoint_format="msgpack",
    )

    # Release all resources used by the Policy.
    del restored_policy

    return msgpack_checkpoint_dir


@PublicAPI
def try_import_msgpack(error: bool = False):
    """Tries importing msgpack and msgpack_numpy and returns the patched msgpack module.

    Returns None if error is False and msgpack or msgpack_numpy is not installed.
    Raises an error, if error is True and the modules could not be imported.

    Args:
        error: Whether to raise an error if msgpack/msgpack_numpy cannot be imported.

    Returns:
        The `msgpack` module (patched by msgpack_numpy), or None.

    Raises:
        ImportError: If error=True and msgpack/msgpack_numpy is not installed.
    """
    try:
        import msgpack
        import msgpack_numpy

        # Make msgpack_numpy look like msgpack.
        msgpack_numpy.patch()
    except Exception:
        if error:
            raise ImportError(
                "Could not import or setup msgpack and msgpack_numpy! "
                "Try running `pip install msgpack msgpack_numpy` first."
            )
        return None

    return msgpack
# A constant to use for any configuration that should be deprecated
# (to check, whether this config has actually been assigned a proper value or
# not).
DEPRECATED_VALUE = -1


def deprecation_warning(
    old: str,
    new: Optional[str] = None,
    *,
    help: Optional[str] = None,
    error: Optional[Union[bool, Exception]] = None,
) -> None:
    """Warns (via the `logger` object) or throws a deprecation warning/error.

    Args:
        old: A description of the "thing" that is to be deprecated.
        new: A description of the new "thing" that replaces it.
        help: An optional help text to tell the user, what to
            do instead of using `old`.
        error: Whether or which exception to raise. If True, raise ValueError.
            If False, just warn. If `error` is-a subclass of Exception,
            raise that Exception.

    Raises:
        ValueError: If `error=True`.
        Exception: Of type `error`, iff `error` is a sub-class of `Exception`.
    """
    # Build the message: prefer pointing at the replacement, then the help
    # text, then nothing.
    if new:
        suffix = f" Use `{new}` instead."
    elif help:
        suffix = f" {help}"
    else:
        suffix = ""
    msg = f"`{old}` has been deprecated.{suffix}"

    if not error:
        logger.warning(
            "DeprecationWarning: " + msg + " This will raise an error in the future!"
        )
        return

    # `error` is truthy: either a plain flag or an Exception subclass.
    if not isinstance(error, bool) and issubclass(error, Exception):
        raise error(msg)
    raise ValueError(msg)


def Deprecated(old=None, *, new=None, help=None, error):
    """Decorator for documenting a deprecated class, method, or function.

    Patches the decorated class' constructor (or wraps the decorated
    method/function) so that each first use emits a `deprecation_warning`
    (or raises, depending on `error`), then delegates to the original
    implementation.

    Args:
        old: Name of the deprecated "thing"; defaults to the decorated
            object's `__name__`.
        new: Name of the replacement, if any.
        help: Extra help text, shown when there is no replacement.
        error: Whether/what to raise (see `deprecation_warning`).
    """

    def _inner(obj):
        def _warn_once():
            # Warn (or raise) only once per deprecated name.
            if log_once(old or obj.__name__):
                deprecation_warning(
                    old=old or obj.__name__,
                    new=new,
                    help=help,
                    error=error,
                )

        # A deprecated class: patch its constructor to warn on instantiation.
        if inspect.isclass(obj):
            original_init = obj.__init__

            def patched_init(*args, **kwargs):
                _warn_once()
                return original_init(*args, **kwargs)

            obj.__init__ = patched_init
            _mark_annotated(obj)
            # Return the patched class (with the warning/error when
            # instantiated).
            return obj

        # A deprecated class method or function: wrap it to warn when called.
        def _ctor(*args, **kwargs):
            _warn_once()
            # Call the deprecated method/function.
            return obj(*args, **kwargs)

        # Return the patched class method/function.
        return _ctor

    # Return the prepared decorator.
    return _inner
+ +Try one of the following: +a) For Atari support: `pip install gym[atari] autorom[accept-rom-license]`. + For PyBullet support: `pip install pybullet`. +b) To register your custom env, do `from ray import tune; + tune.register('[name]', lambda cfg: [return env obj from here using cfg])`. + Then in your config, do `config['env'] = [name]`. +c) Make sure you provide a fully qualified classpath, e.g.: + `ray.rllib.examples.envs.classes.repeat_after_me_env.RepeatAfterMeEnv` +""" + + +ERR_MSG_OLD_GYM_API = """Your environment ({}) does not abide to the new gymnasium-style API! +From Ray 2.3 on, RLlib only supports the new (gym>=0.26 or gymnasium) Env APIs. +{} +Learn more about the most important changes here: +https://github.com/openai/gym and here: https://github.com/Farama-Foundation/Gymnasium + +In order to fix this problem, do the following: + +1) Run `pip install gymnasium` on your command line. +2) Change all your import statements in your code from + `import gym` -> `import gymnasium as gym` OR + `from gym.spaces import Discrete` -> `from gymnasium.spaces import Discrete` + +For your custom (single agent) gym.Env classes: +3.1) Either wrap your old Env class via the provided `from gymnasium.wrappers import + EnvCompatibility` wrapper class. +3.2) Alternatively to 3.1: + - Change your `reset()` method to have the call signature 'def reset(self, *, + seed=None, options=None)' + - Return an additional info dict (empty dict should be fine) from your `reset()` + method. + - Return an additional `truncated` flag from your `step()` method (between `done` and + `info`). This flag should indicate, whether the episode was terminated prematurely + due to some time constraint or other kind of horizon setting. + +For your custom RLlib `MultiAgentEnv` classes: +4.1) Either wrap your old MultiAgentEnv via the provided + `from ray.rllib.env.wrappers.multi_agent_env_compatibility import + MultiAgentEnvCompatibility` wrapper class. 
ERR_MSG_TF_POLICY_CANNOT_SAVE_KERAS_MODEL = """Could not save keras model under self[TfPolicy].model.base_model!
    This is either due to ..
    a) .. this Policy's ModelV2 not having any `base_model` (tf.keras.Model) property
    b) .. the ModelV2's `base_model` not being used by the Algorithm and thus its
    variables not being properly initialized.
"""  # noqa

# Fixed grammar of the user-facing message below: "which is torch.save() cannot
# pickle" -> "which torch.save() cannot pickle".
ERR_MSG_TORCH_POLICY_CANNOT_SAVE_MODEL = """Could not save torch model under self[TorchPolicy].model!
    This is most likely due to the fact that you are using an Algorithm that
    uses a Catalog-generated TorchModelV2 subclass, which torch.save() cannot pickle.
"""  # noqa

# -------
# HOWTO_ strings can be added to any error/warning/info message
# to explain to the user, how to actually fix the encountered problem.
# -------
@OldAPIStack
class FilterManager:
    """Manages filters and coordination across remote evaluators that expose
    `get_filters` and `sync_filters`.
    """

    @staticmethod
    def synchronize(
        local_filters,
        worker_set,
        update_remote=True,
        timeout_seconds: Optional[float] = None,
        use_remote_data_for_update: bool = True,
    ):
        """Aggregates filters from remote workers (if use_remote_data_for_update=True).

        Local copy is updated and then broadcasted to all remote evaluators
        (if `update_remote` is True).

        Args:
            local_filters: Filters to be synchronized.
            worker_set: EnvRunnerGroup with remote EnvRunners with filters.
            update_remote: Whether to push updates from the local filters to the
                remote workers' filters.
            timeout_seconds: How long to wait for filter to get or set filters.
            use_remote_data_for_update: Whether to use the `worker_set`'s remote
                workers to update the local filters. If False, stats from the
                remote workers will not be used and discarded.
        """
        # Neither pulling nor pushing requested -> Nothing to do.
        if not update_remote and not use_remote_data_for_update:
            return

        logger.debug(f"Synchronizing filters: {local_filters}")

        # Pull the current filter states from all healthy remote workers
        # (flushing their buffers in the process).
        remote_filters = worker_set.foreach_env_runner(
            func=lambda worker: worker.get_filters(flush_after=True),
            local_env_runner=False,
            timeout_seconds=timeout_seconds,
        )
        if len(remote_filters) != worker_set.num_healthy_remote_workers():
            logger.error(
                "Failed to get remote filters from a rollout worker in "
                "FilterManager! "
                "Filtered metrics may be computed, but filtered wrong."
            )

        # Merge the remote workers' filter stats into the local filters.
        if use_remote_data_for_update:
            for remote_filter_dict in remote_filters:
                for key, local_filter in local_filters.items():
                    local_filter.apply_changes(
                        remote_filter_dict[key], with_buffer=False
                    )

        # Broadcast the (now possibly synched) local filters back to all remote
        # workers.
        if update_remote:
            serializable_filters = {
                key: local_filter.as_serializable()
                for key, local_filter in local_filters.items()
            }
            remote_copy = ray.put(serializable_filters)

            logger.debug("Updating remote filters ...")
            results = worker_set.foreach_env_runner(
                func=lambda worker: worker.sync_filters(ray.get(remote_copy)),
                local_env_runner=False,
                timeout_seconds=timeout_seconds,
            )
            if len(results) != worker_set.num_healthy_remote_workers():
                logger.error(
                    "Failed to set remote filters to a rollout worker in "
                    "FilterManager. "
                    "Filtered metrics may be computed, but filtered wrong."
                )
@PublicAPI
def convert_to_tensor(
    data: TensorStructType,
    framework: str,
    device: Optional[str] = None,
):
    """Converts any nested numpy struct into framework-specific tensors.

    Args:
        data: The input data (numpy) to convert to framework-specific tensors.
        framework: The framework to convert to. Only "torch" and "tf2" allowed.
        device: An optional device name (for torch only).

    Returns:
        The converted tensor struct matching the input data.
    """
    if framework == "torch":
        from ray.rllib.utils.torch_utils import convert_to_torch_tensor

        return convert_to_torch_tensor(data, device=device)
    if framework == "tf2":
        _, tf, _ = try_import_tf()

        return tree.map_structure(tf.convert_to_tensor, data)
    raise NotImplementedError(
        f"framework={framework} not supported in `convert_to_tensor()`!"
    )


@PublicAPI
def get_device(config: "AlgorithmConfig", num_gpus_requested: int = 1):
    """Returns a single device (CPU or some GPU) depending on a config.

    Args:
        config: An AlgorithmConfig to extract information from about the device
            to use.
        num_gpus_requested: The number of GPUs actually requested. This may be the
            value of `config.num_gpus_per_env_runner` when for example calling
            this function from an EnvRunner.

    Returns:
        A single device (or name) given `config` and `num_gpus_requested`.
    """
    if config.framework_str != "torch":
        raise NotImplementedError(
            f"`framework_str` {config.framework_str} not supported!"
        )

    torch, _ = try_import_torch()

    # TODO (Kourosh): How do we handle model parallelism?
    # TODO (Kourosh): Instead of using _TorchAccelerator, we should use the public
    #  API in ray.train but allow for session to be None without any errors raised.
    if num_gpus_requested <= 0:
        return torch.device("cpu")

    from ray.air._internal.torch_utils import get_devices

    # `get_devices()` returns a list that contains the 0th device if
    # it is called from outside a Ray Train session. It's necessary to give
    # the user the option to run on the gpu of their choice, so we enable that
    # option here through `config.local_gpu_idx`.
    devices = get_devices()
    if len(devices) == 1:
        return devices[0]

    assert config.local_gpu_idx < torch.cuda.device_count(), (
        f"local_gpu_idx {config.local_gpu_idx} is not a valid GPU ID "
        "or is not available."
    )
    # This is an index into the available CUDA devices. For example, if
    # `os.environ["CUDA_VISIBLE_DEVICES"] = "1"` then
    # `torch.cuda.device_count() = 1` and torch.device(0) maps to that GPU
    # with ID=1 on the node.
    return torch.device(config.local_gpu_idx)


@PublicAPI
def try_import_jax(error: bool = False):
    """Tries importing JAX and FLAX and returns both modules (or Nones).

    Args:
        error: Whether to raise an error if JAX/FLAX cannot be imported.

    Returns:
        Tuple containing the jax- and the flax modules.

    Raises:
        ImportError: If error=True and JAX is not installed.
    """
    if "RLLIB_TEST_NO_JAX_IMPORT" in os.environ:
        logger.warning("Not importing JAX for test purposes.")
        return None, None

    try:
        import flax
        import jax
    except ImportError:
        if error:
            raise ImportError(
                "Could not import JAX! RLlib requires you to "
                "install at least one deep-learning framework: "
                "`pip install [torch|tensorflow|jax]`."
            )
        return None, None

    return jax, flax


@PublicAPI
def try_import_tf(error: bool = False):
    """Tries importing tf and returns the module (or None).

    Args:
        error: Whether to raise an error if tf cannot be imported.

    Returns:
        Tuple containing
        1) tf1.x module (either from tf2.x.compat.v1 OR as tf1.x).
        2) tf module (resulting from `import tensorflow`), either tf1.x or 2.x.
        3) The actually installed tf version as int: 1 or 2.

    Raises:
        ImportError: If error=True and tf is not installed.
    """
    tf_stub = _TFStub()
    # Make sure, these are reset after each test case
    # that uses them: del os.environ["RLLIB_TEST_NO_TF_IMPORT"]
    if "RLLIB_TEST_NO_TF_IMPORT" in os.environ:
        logger.warning("Not importing TensorFlow for test purposes")
        return None, tf_stub, None

    os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "3")

    # Try to reuse an already imported tf module. This will avoid going through
    # the initial import steps below and thereby switching off v2_behavior
    # (switching off v2 behavior twice breaks all-framework tests for eager).
    tf_module = sys.modules.get("tensorflow")
    was_imported = tf_module is not None
    if not was_imported:
        try:
            import tensorflow as tf_module
        except ImportError:
            if error:
                raise ImportError(
                    "Could not import TensorFlow! RLlib requires you to "
                    "install at least one deep-learning framework: "
                    "`pip install [torch|tensorflow|jax]`."
                )
            return None, tf_stub, None

    # Try "reducing" tf to tf.compat.v1.
    try:
        tf1_module = tf_module.compat.v1
        tf1_module.logging.set_verbosity(tf1_module.logging.ERROR)
        if not was_imported:
            tf1_module.disable_v2_behavior()
            tf1_module.enable_resource_variables()
        tf1_module.logging.set_verbosity(tf1_module.logging.WARN)
    # No compat.v1 -> return tf as is.
    except AttributeError:
        tf1_module = tf_module

    if not hasattr(tf_module, "__version__"):
        version = 1  # sphinx doc gen
    else:
        version = 2 if tf_module.__version__.startswith("2.") else 1

    return tf1_module, tf_module, version


# Fake module for tf, returned when TensorFlow is not installed.
class _TFStub:
    def __init__(self) -> None:
        self.keras = _KerasStub()

    def __bool__(self):
        # If used as a truth value, the stub reads as "no tf available".
        return False


# Fake module for tf.keras.
class _KerasStub:
    def __init__(self) -> None:
        self.Model = _FakeTfClassStub


# Fake classes under keras (e.g. for tf.keras.Model): raise on instantiation.
class _FakeTfClassStub:
    def __init__(self, *a, **kw):
        raise ImportError("Could not import `tensorflow`. Try pip install tensorflow.")


@DeveloperAPI
def tf_function(tf_module):
    """Conditional decorator for @tf.function.

    Use @tf_function(tf) instead to avoid errors if tf is not installed."""

    # The actual decorator to use (pass in `tf` (which could be None)).
    def decorator(func):
        # If tf not installed (or in eager mode) -> return function as is.
        if tf_module is None or tf_module.executing_eagerly():
            return func
        # If tf installed, return @tf.function-decorated function.
        return tf_module.function(func)

    return decorator
+ """ + if "RLLIB_TEST_NO_TF_IMPORT" in os.environ: + logger.warning("Not importing TensorFlow Probability for test purposes.") + return None + + try: + import tensorflow_probability as tfp + + return tfp + except ImportError as e: + if error: + raise e + return None + + +# Fake module for torch.nn. +class _NNStub: + def __init__(self, *a, **kw): + # Fake nn.functional module within torch.nn. + self.functional = None + self.Module = _FakeTorchClassStub + self.parallel = _ParallelStub() + + +# Fake class for e.g. torch.nn.Module to allow it to be inherited from. +class _FakeTorchClassStub: + def __init__(self, *a, **kw): + raise ImportError("Could not import `torch`. Try pip install torch.") + + +class _ParallelStub: + def __init__(self, *a, **kw): + self.DataParallel = _FakeTorchClassStub + self.DistributedDataParallel = _FakeTorchClassStub + + +@PublicAPI +def try_import_torch(error: bool = False): + """Tries importing torch and returns the module (or None). + + Args: + error: Whether to raise an error if torch cannot be imported. + + Returns: + Tuple consisting of the torch- AND torch.nn modules. + + Raises: + ImportError: If error=True and PyTorch is not installed. + """ + if "RLLIB_TEST_NO_TORCH_IMPORT" in os.environ: + logger.warning("Not importing PyTorch for test purposes.") + return _torch_stubs() + + try: + import torch + import torch.nn as nn + + return torch, nn + except ImportError: + if error: + raise ImportError( + "Could not import PyTorch! RLlib requires you to " + "install at least one deep-learning framework: " + "`pip install [torch|tensorflow|jax]`." 
+ ) + return _torch_stubs() + + +def _torch_stubs(): + nn = _NNStub() + return None, nn + + +@DeveloperAPI +def get_variable( + value: Any, + framework: str = "tf", + trainable: bool = False, + tf_name: str = "unnamed-variable", + torch_tensor: bool = False, + device: Optional[str] = None, + shape: Optional[TensorShape] = None, + dtype: Optional[TensorType] = None, +) -> Any: + """Creates a tf variable, a torch tensor, or a python primitive. + + Args: + value: The initial value to use. In the non-tf case, this will + be returned as is. In the tf case, this could be a tf-Initializer + object. + framework: One of "tf", "torch", or None. + trainable: Whether the generated variable should be + trainable (tf)/require_grad (torch) or not (default: False). + tf_name: For framework="tf": An optional name for the + tf.Variable. + torch_tensor: For framework="torch": Whether to actually create + a torch.tensor, or just a python value (default). + device: An optional torch device to use for + the created torch tensor. + shape: An optional shape to use iff `value` + does not have any (e.g. if it's an initializer w/o explicit value). + dtype: An optional dtype to use iff `value` does + not have any (e.g. if it's an initializer w/o explicit value). + This should always be a numpy dtype (e.g. np.float32, np.int64). + + Returns: + A framework-specific variable (tf.Variable, torch.tensor, or + python primitive). 
+ """ + if framework in ["tf2", "tf"]: + import tensorflow as tf + + dtype = dtype or getattr( + value, + "dtype", + tf.float32 + if isinstance(value, float) + else tf.int32 + if isinstance(value, int) + else None, + ) + return tf.compat.v1.get_variable( + tf_name, + initializer=value, + dtype=dtype, + trainable=trainable, + **({} if shape is None else {"shape": shape}), + ) + elif framework == "torch" and torch_tensor is True: + torch, _ = try_import_torch() + if not isinstance(value, np.ndarray): + value = np.array(value) + var_ = torch.from_numpy(value) + if dtype in [torch.float32, np.float32]: + var_ = var_.float() + elif dtype in [torch.int32, np.int32]: + var_ = var_.int() + elif dtype in [torch.float64, np.float64]: + var_ = var_.double() + + if device: + var_ = var_.to(device) + var_.requires_grad = trainable + return var_ + # torch or None: Return python primitive. + return value + + +@Deprecated( + old="rllib/utils/framework.py::get_activation_fn", + new="rllib/models/utils.py::get_activation_fn", + error=True, +) +def get_activation_fn(name: Optional[str] = None, framework: str = "tf"): + pass diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/from_config.py b/.venv/lib/python3.11/site-packages/ray/rllib/utils/from_config.py new file mode 100644 index 0000000000000000000000000000000000000000..522ba8dd28783f93d41cf257507808f38259c5a4 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/utils/from_config.py @@ -0,0 +1,325 @@ +from copy import deepcopy +from functools import partial +import importlib +import json +import os +import re +import yaml + +from ray.rllib.utils.annotations import DeveloperAPI +from ray.rllib.utils import force_list, merge_dicts + + +@DeveloperAPI +def from_config(cls, config=None, **kwargs): + """Uses the given config to create an object. + + If `config` is a dict, an optional "type" key can be used as a + "constructor hint" to specify a certain class of the object. 
+ If `config` is not a dict, `config`'s value is used directly as this + "constructor hint". + + The rest of `config` (if it's a dict) will be used as kwargs for the + constructor. Additional keys in **kwargs will always have precedence + (overwrite keys in `config` (if a dict)). + Also, if the config-dict or **kwargs contains the special key "_args", + it will be popped from the dict and used as *args list to be passed + separately to the constructor. + + The following constructor hints are valid: + - None: Use `cls` as constructor. + - An already instantiated object: Will be returned as is; no + constructor call. + - A string or an object that is a key in `cls`'s `__type_registry__` + dict: The value in `__type_registry__` for that key will be used + as the constructor. + - A python callable: Use that very callable as constructor. + - A string: Either a json/yaml filename or the name of a python + module+class (e.g. "ray.rllib. [...] .[some class name]") + + Args: + cls: The class to build an instance for (from `config`). + config (Optional[dict, str]): The config dict or type-string or + filename. + + Keyword Args: + kwargs: Optional possibility to pass the constructor arguments in + here and use `config` as the type-only info. Then we can call + this like: from_config([type]?, [**kwargs for constructor]) + If `config` is already a dict, then `kwargs` will be merged + with `config` (overwriting keys in `config`) after "type" has + been popped out of `config`. + If a constructor of a Configurable needs *args, the special + key `_args` can be passed inside `kwargs` with a list value + (e.g. kwargs={"_args": [arg1, arg2, arg3]}). + + Returns: + any: The object generated from the config. + """ + # `cls` is the config (config is None). + if config is None and isinstance(cls, (dict, str)): + config = cls + cls = None + # `config` is already a created object of this class -> + # Take it as is. 
+ elif isinstance(cls, type) and isinstance(config, cls): + return config + + # `type_`: Indicator for the Configurable's constructor. + # `ctor_args`: *args arguments for the constructor. + # `ctor_kwargs`: **kwargs arguments for the constructor. + # Try to copy, so caller can reuse safely. + try: + config = deepcopy(config) + except Exception: + pass + if isinstance(config, dict): + type_ = config.pop("type", None) + if type_ is None and isinstance(cls, str): + type_ = cls + ctor_kwargs = config + # Give kwargs priority over things defined in config dict. + # This way, one can pass a generic `spec` and then override single + # constructor parameters via the kwargs in the call to `from_config`. + ctor_kwargs.update(kwargs) + else: + type_ = config + if type_ is None and "type" in kwargs: + type_ = kwargs.pop("type") + ctor_kwargs = kwargs + # Special `_args` field in kwargs for *args-utilizing constructors. + ctor_args = force_list(ctor_kwargs.pop("_args", [])) + + # Figure out the actual constructor (class) from `type_`. + # None: Try __default__object (if no args/kwargs), only then + # constructor of cls (using args/kwargs). + if type_ is None: + # We have a default constructor that was defined directly by cls + # (not by its children). + if ( + cls is not None + and hasattr(cls, "__default_constructor__") + and cls.__default_constructor__ is not None + and ctor_args == [] + and ( + not hasattr(cls.__bases__[0], "__default_constructor__") + or cls.__bases__[0].__default_constructor__ is None + or cls.__bases__[0].__default_constructor__ + is not cls.__default_constructor__ + ) + ): + constructor = cls.__default_constructor__ + # Default constructor's keywords into ctor_kwargs. + if isinstance(constructor, partial): + kwargs = merge_dicts(ctor_kwargs, constructor.keywords) + constructor = partial(constructor.func, **kwargs) + ctor_kwargs = {} # erase to avoid duplicate kwarg error + # No default constructor -> Try cls itself as constructor. 
+ else: + constructor = cls + # Try the __type_registry__ of this class. + else: + constructor = _lookup_type(cls, type_) + + # Found in cls.__type_registry__. + if constructor is not None: + pass + # type_ is False or None (and this value is not registered) -> + # return value of type_. + elif type_ is False or type_ is None: + return type_ + # Python callable. + elif callable(type_): + constructor = type_ + # A string: Filename or a python module+class or a json/yaml str. + elif isinstance(type_, str): + if re.search("\\.(yaml|yml|json)$", type_): + return from_file(cls, type_, *ctor_args, **ctor_kwargs) + # Try un-json/un-yaml'ing the string into a dict. + obj = yaml.safe_load(type_) + if isinstance(obj, dict): + return from_config(cls, obj) + try: + obj = from_config(cls, json.loads(type_)) + except json.JSONDecodeError: + pass + else: + return obj + + # Test for absolute module.class path specifier. + if type_.find(".") != -1: + module_name, function_name = type_.rsplit(".", 1) + try: + module = importlib.import_module(module_name) + constructor = getattr(module, function_name) + # Module not found. + except (ModuleNotFoundError, ImportError, AttributeError): + pass + + # If constructor still not found, try attaching cls' module, + # then look for type_ in there. + if constructor is None: + if isinstance(cls, str): + # Module found, but doesn't have the specified + # c'tor/function. + raise ValueError( + f"Full classpath specifier ({type_}) must be a valid " + "full [module].[class] string! E.g.: " + "`my.cool.module.MyCoolClass`." + ) + + try: + module = importlib.import_module(cls.__module__) + constructor = getattr(module, type_) + except (ModuleNotFoundError, ImportError, AttributeError): + # Try the package as well. 
+ try: + package_name = importlib.import_module( + cls.__module__ + ).__package__ + module = __import__(package_name, fromlist=[type_]) + constructor = getattr(module, type_) + except (ModuleNotFoundError, ImportError, AttributeError): + pass + + if constructor is None: + raise ValueError( + f"String specifier ({type_}) must be a valid filename, " + f"a [module].[class], a class within '{cls.__module__}', " + f"or a key into {cls.__name__}.__type_registry__!" + ) + + if not constructor: + raise TypeError("Invalid type '{}'. Cannot create `from_config`.".format(type_)) + + # Create object with inferred constructor. + try: + object_ = constructor(*ctor_args, **ctor_kwargs) + # Catch attempts to construct from an abstract class and return None. + except TypeError as e: + if re.match("Can't instantiate abstract class", e.args[0]): + return None + raise e # Re-raise + # No sanity check for fake (lambda)-"constructors". + if type(constructor).__name__ != "function": + assert isinstance( + object_, + constructor.func if isinstance(constructor, partial) else constructor, + ) + + return object_ + + +@DeveloperAPI +def from_file(cls, filename, *args, **kwargs): + """ + Create object from config saved in filename. Expects json or yaml file. + + Args: + filename: File containing the config (json or yaml). + + Returns: + any: The object generated from the file. + """ + path = os.path.join(os.getcwd(), filename) + if not os.path.isfile(path): + raise FileNotFoundError("File '{}' not found!".format(filename)) + + with open(path, "rt") as fp: + if path.endswith(".yaml") or path.endswith(".yml"): + config = yaml.safe_load(fp) + else: + config = json.load(fp) + + # Add possible *args. 
+ config["_args"] = args + return from_config(cls, config=config, **kwargs) + + +def _lookup_type(cls, type_): + if ( + cls is not None + and hasattr(cls, "__type_registry__") + and isinstance(cls.__type_registry__, dict) + and ( + type_ in cls.__type_registry__ + or ( + isinstance(type_, str) + and re.sub("[\\W_]", "", type_.lower()) in cls.__type_registry__ + ) + ) + ): + available_class_for_type = cls.__type_registry__.get(type_) + if available_class_for_type is None: + available_class_for_type = cls.__type_registry__[ + re.sub("[\\W_]", "", type_.lower()) + ] + return available_class_for_type + return None + + +class _NotProvided: + """Singleton class to provide a "not provided" value for AlgorithmConfig signatures. + + Using the only instance of this class indicates that the user does NOT wish to + change the value of some property. + + .. testcode:: + :skipif: True + + from ray.rllib.algorithms.algorithm_config import AlgorithmConfig + config = AlgorithmConfig() + # Print out the default learning rate. + print(config.lr) + + .. testoutput:: + + 0.001 + + .. testcode:: + :skipif: True + + # Print out the default `preprocessor_pref`. + print(config.preprocessor_pref) + + .. testoutput:: + + "deepmind" + + .. testcode:: + :skipif: True + + # Will only set the `preprocessor_pref` property (to None) and leave + # all other properties at their default values. + config.training(preprocessor_pref=None) + config.preprocessor_pref is None + + .. testoutput:: + + True + + .. testcode:: + :skipif: True + + # Still the same value (didn't touch it in the call to `.training()`. + print(config.lr) + + .. testoutput:: + + 0.001 + """ + + class __NotProvided: + pass + + instance = None + + def __init__(self): + if _NotProvided.instance is None: + _NotProvided.instance = _NotProvided.__NotProvided() + + +# Use this object as default values in all method signatures of +# AlgorithmConfig, indicating that the respective property should NOT be touched +# in the call. 
+NotProvided = _NotProvided() diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/images.py b/.venv/lib/python3.11/site-packages/ray/rllib/utils/images.py new file mode 100644 index 0000000000000000000000000000000000000000..7b0f1601d574a5e34805b30cfed7ca9c391c1f0c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/utils/images.py @@ -0,0 +1,60 @@ +import logging +import importlib + +import numpy as np + +from ray.rllib.utils.annotations import DeveloperAPI + +logger = logging.getLogger(__name__) + + +@DeveloperAPI +def is_package_installed(package_name): + try: + importlib.metadata.version(package_name) + return True + except importlib.metadata.PackageNotFoundError: + return False + + +try: + import cv2 + + cv2.ocl.setUseOpenCL(False) + + logger.debug("CV2 found for image processing.") +except ImportError as e: + if is_package_installed("opencv-python"): + raise ImportError( + f"OpenCV is installed, but we failed to import it. This may be because " + f"you need to install `opencv-python-headless` instead of " + f"`opencv-python`. Error message: {e}", + ) + cv2 = None + + +@DeveloperAPI +def resize(img: np.ndarray, height: int, width: int) -> np.ndarray: + if not cv2: + raise ModuleNotFoundError( + "`opencv` not installed! Do `pip install opencv-python`" + ) + return cv2.resize(img, (width, height), interpolation=cv2.INTER_AREA) + + +@DeveloperAPI +def rgb2gray(img: np.ndarray) -> np.ndarray: + if not cv2: + raise ModuleNotFoundError( + "`opencv` not installed! Do `pip install opencv-python`" + ) + return cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) + + +@DeveloperAPI +def imread(img_file: str) -> np.ndarray: + if not cv2: + raise ModuleNotFoundError( + "`opencv` not installed! 
Do `pip install opencv-python`" + ) + return cv2.imread(img_file).astype(np.float32) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/lambda_defaultdict.py b/.venv/lib/python3.11/site-packages/ray/rllib/utils/lambda_defaultdict.py new file mode 100644 index 0000000000000000000000000000000000000000..ce4653961c565304570da87eea9066dc366643e0 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/utils/lambda_defaultdict.py @@ -0,0 +1,52 @@ +from collections import defaultdict +from typing import Any, Callable + + +class LambdaDefaultDict(defaultdict): + """A defaultdict that creates default values based on the associated key. + + Note that the standard defaultdict can only produce default values (via its factory) + that are independent of the key under which they are stored. + As opposed to that, the lambda functions used as factories for this + `LambdaDefaultDict` class do accept a single argument: The missing key. + If a missing key is accessed by the user, the provided lambda function is called + with this missing key as its argument. The returned value is stored in the + dictionary under that key and returned. + + Example: + + In this example, if you try to access a key that doesn't exist, it will call + the lambda function, passing it the missing key. The function will return a + string, which will be stored in the dictionary under that key. + + .. testcode:: + + from ray.rllib.utils.lambda_defaultdict import LambdaDefaultDict + + default_dict = LambdaDefaultDict(lambda missing_key: f"Value for {missing_key}") + print(default_dict["a"]) + + .. testoutput:: + + Value for a + """ # noqa: E501 + + def __init__(self, default_factory: Callable[[str], Any], *args, **kwargs): + """Initializes a LambdaDefaultDict instance. + + Args: + default_factory: The default factory callable, taking a string (key) + and returning the default value to use for that key. 
+ """ + if not callable(default_factory): + raise TypeError("First argument must be a Callable!") + + # We will handle the factory in __missing__ method. + super().__init__(None, *args, **kwargs) + + self.default_factory = default_factory + + def __missing__(self, key): + # Call default factory with the key as argument. + self[key] = value = self.default_factory(key) + return value diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/memory.py b/.venv/lib/python3.11/site-packages/ray/rllib/utils/memory.py new file mode 100644 index 0000000000000000000000000000000000000000..fe739cc0f99b8f23f68f31021b446cbf06f64d17 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/utils/memory.py @@ -0,0 +1,8 @@ +from ray.rllib.utils.deprecation import deprecation_warning +from ray.rllib.utils.numpy import aligned_array, concat_aligned # noqa + +deprecation_warning( + old="ray.rllib.utils.memory.[...]", + new="ray.rllib.utils.numpy.[...]", + error=True, +) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/numpy.py b/.venv/lib/python3.11/site-packages/ray/rllib/utils/numpy.py new file mode 100644 index 0000000000000000000000000000000000000000..831a4fbcf5365cae130d638569cad20454e5e9fd --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/utils/numpy.py @@ -0,0 +1,606 @@ +from collections import OrderedDict +from gymnasium.spaces import Discrete, MultiDiscrete +import numpy as np +import tree # pip install dm_tree +from types import MappingProxyType +from typing import List, Optional + + +from ray.rllib.utils.annotations import PublicAPI +from ray.rllib.utils.deprecation import Deprecated +from ray.rllib.utils.framework import try_import_tf, try_import_torch +from ray.rllib.utils.typing import SpaceStruct, TensorType, TensorStructType, Union + +tf1, tf, tfv = try_import_tf() +torch, _ = try_import_torch() + +SMALL_NUMBER = 1e-6 +# Some large int number. May be increased here, if needed. 
+LARGE_INTEGER = 100000000 +# Min and Max outputs (clipped) from an NN-output layer interpreted as the +# log(x) of some x (e.g. a stddev of a normal +# distribution). +MIN_LOG_NN_OUTPUT = -5 +MAX_LOG_NN_OUTPUT = 2 + + +@PublicAPI +@Deprecated( + help="RLlib itself has no use for this anymore.", + error=False, +) +def aligned_array(size: int, dtype, align: int = 64) -> np.ndarray: + """Returns an array of a given size that is 64-byte aligned. + + The returned array can be efficiently copied into GPU memory by TensorFlow. + + Args: + size: The size (total number of items) of the array. For example, + array([[0.0, 1.0], [2.0, 3.0]]) would have size=4. + dtype: The numpy dtype of the array. + align: The alignment to use. + + Returns: + A np.ndarray with the given specifications. + """ + n = size * dtype.itemsize + empty = np.empty(n + (align - 1), dtype=np.uint8) + data_align = empty.ctypes.data % align + offset = 0 if data_align == 0 else (align - data_align) + if n == 0: + # stop np from optimising out empty slice reference + output = empty[offset : offset + 1][0:0].view(dtype) + else: + output = empty[offset : offset + n].view(dtype) + + assert len(output) == size, len(output) + assert output.ctypes.data % align == 0, output.ctypes.data + return output + + +@PublicAPI +@Deprecated( + help="RLlib itself has no use for this anymore.", + error=False, +) +def concat_aligned( + items: List[np.ndarray], time_major: Optional[bool] = None +) -> np.ndarray: + """Concatenate arrays, ensuring the output is 64-byte aligned. + + We only align float arrays; other arrays are concatenated as normal. + + This should be used instead of np.concatenate() to improve performance + when the output array is likely to be fed into TensorFlow. + + Args: + items: The list of items to concatenate and align. + time_major: Whether the data in items is time-major, in which + case, we will concatenate along axis=1. + + Returns: + The concat'd and aligned array. 
+ """ + + if len(items) == 0: + return [] + elif len(items) == 1: + # we assume the input is aligned. In any case, it doesn't help + # performance to force align it since that incurs a needless copy. + return items[0] + elif isinstance(items[0], np.ndarray) and items[0].dtype in [ + np.float32, + np.float64, + np.uint8, + ]: + dtype = items[0].dtype + flat = aligned_array(sum(s.size for s in items), dtype) + if time_major is not None: + if time_major is True: + batch_dim = sum(s.shape[1] for s in items) + new_shape = (items[0].shape[0], batch_dim,) + items[ + 0 + ].shape[2:] + else: + batch_dim = sum(s.shape[0] for s in items) + new_shape = (batch_dim, items[0].shape[1],) + items[ + 0 + ].shape[2:] + else: + batch_dim = sum(s.shape[0] for s in items) + new_shape = (batch_dim,) + items[0].shape[1:] + output = flat.reshape(new_shape) + assert output.ctypes.data % 64 == 0, output.ctypes.data + np.concatenate(items, out=output, axis=1 if time_major else 0) + return output + else: + return np.concatenate(items, axis=1 if time_major else 0) + + +@PublicAPI +def convert_to_numpy(x: TensorStructType, reduce_type: bool = True) -> TensorStructType: + """Converts values in `stats` to non-Tensor numpy or python types. + + Args: + x: Any (possibly nested) struct, the values in which will be + converted and returned as a new struct with all torch/tf tensors + being converted to numpy types. + reduce_type: Whether to automatically reduce all float64 and int64 data + into float32 and int32 data, respectively. + + Returns: + A new struct with the same structure as `x`, but with all + values converted to numpy arrays (on CPU). + """ + + # The mapping function used to numpyize torch/tf Tensors (and move them + # to the CPU beforehand). 
+ def mapping(item): + if torch and isinstance(item, torch.Tensor): + ret = ( + item.cpu().item() + if len(item.size()) == 0 + else item.detach().cpu().numpy() + ) + elif ( + tf and isinstance(item, (tf.Tensor, tf.Variable)) and hasattr(item, "numpy") + ): + assert tf.executing_eagerly() + ret = item.numpy() + else: + ret = item + if reduce_type and isinstance(ret, np.ndarray): + if np.issubdtype(ret.dtype, np.floating): + ret = ret.astype(np.float32) + elif np.issubdtype(ret.dtype, int): + ret = ret.astype(np.int32) + return ret + return ret + + return tree.map_structure(mapping, x) + + +@PublicAPI +def fc( + x: np.ndarray, + weights: np.ndarray, + biases: Optional[np.ndarray] = None, + framework: Optional[str] = None, +) -> np.ndarray: + """Calculates FC (dense) layer outputs given weights/biases and input. + + Args: + x: The input to the dense layer. + weights: The weights matrix. + biases: The biases vector. All 0s if None. + framework: An optional framework hint (to figure out, + e.g. whether to transpose torch weight matrices). + + Returns: + The dense layer's output. + """ + + def map_(data, transpose=False): + if torch: + if isinstance(data, torch.Tensor): + data = data.cpu().detach().numpy() + if tf and tf.executing_eagerly(): + if isinstance(data, tf.Variable): + data = data.numpy() + if transpose: + data = np.transpose(data) + return data + + x = map_(x) + # Torch stores matrices in transpose (faster for backprop). + transpose = framework == "torch" and ( + x.shape[1] != weights.shape[0] and x.shape[1] == weights.shape[1] + ) + weights = map_(weights, transpose=transpose) + biases = map_(biases) + + return np.matmul(x, weights) + (0.0 if biases is None else biases) + + +@PublicAPI +def flatten_inputs_to_1d_tensor( + inputs: TensorStructType, + spaces_struct: Optional[SpaceStruct] = None, + time_axis: bool = False, + batch_axis: bool = True, +) -> TensorType: + """Flattens arbitrary input structs according to the given spaces struct. 
+ + Returns a single 1D tensor resulting from the different input + components' values. + + Thereby: + - Boxes (any shape) get flattened to (B, [T]?, -1). Note that image boxes + are not treated differently from other types of Boxes and get + flattened as well. + - Discrete (int) values are one-hot'd, e.g. a batch of [1, 0, 3] (B=3 with + Discrete(4) space) results in [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1]]. + - MultiDiscrete values are multi-one-hot'd, e.g. a batch of + [[0, 2], [1, 4]] (B=2 with MultiDiscrete([2, 5]) space) results in + [[1, 0, 0, 0, 1, 0, 0], [0, 1, 0, 0, 0, 0, 1]]. + + Args: + inputs: The inputs to be flattened. + spaces_struct: The (possibly nested) structure of the spaces that `inputs` + belongs to. + time_axis: Whether all inputs have a time-axis (after the batch axis). + If True, will keep not only the batch axis (0th), but the time axis + (1st) as-is and flatten everything from the 2nd axis up. + batch_axis: Whether all inputs have a batch axis. + If True, will keep that batch axis as-is and flatten everything from the + other dims up. + + Returns: + A single 1D tensor resulting from concatenating all + flattened/one-hot'd input components. Depending on the time_axis flag, + the shape is (B, n) or (B, T, n). + + .. testcode:: + :skipif: True + + # B=2 + from ray.rllib.utils.tf_utils import flatten_inputs_to_1d_tensor + from gymnasium.spaces import Discrete, Box + out = flatten_inputs_to_1d_tensor( + {"a": [1, 0], "b": [[[0.0], [0.1]], [1.0], [1.1]]}, + spaces_struct=dict(a=Discrete(2), b=Box(shape=(2, 1))) + ) + print(out) + + # B=2; T=2 + out = flatten_inputs_to_1d_tensor( + ([[1, 0], [0, 1]], + [[[0.0, 0.1], [1.0, 1.1]], [[2.0, 2.1], [3.0, 3.1]]]), + spaces_struct=tuple([Discrete(2), Box(shape=(2, ))]), + time_axis=True + ) + print(out) + + .. 
testoutput:: + + [[0.0, 1.0, 0.0, 0.1], [1.0, 0.0, 1.0, 1.1]] # B=2 n=4 + [[[0.0, 1.0, 0.0, 0.1], [1.0, 0.0, 1.0, 1.1]], + [[1.0, 0.0, 2.0, 2.1], [0.0, 1.0, 3.0, 3.1]]] # B=2 T=2 n=4 + """ + # `time_axis` must not be True if `batch_axis` is False. + assert not (time_axis and not batch_axis) + + flat_inputs = tree.flatten(inputs) + flat_spaces = ( + tree.flatten(spaces_struct) + if spaces_struct is not None + else [None] * len(flat_inputs) + ) + + B = None + T = None + out = [] + for input_, space in zip(flat_inputs, flat_spaces): + # Store batch and (if applicable) time dimension. + if B is None and batch_axis: + B = input_.shape[0] + if time_axis: + T = input_.shape[1] + + # One-hot encoding. + if isinstance(space, Discrete): + if time_axis: + input_ = np.reshape(input_, [B * T]) + out.append(one_hot(input_, depth=space.n).astype(np.float32)) + # Multi one-hot encoding. + elif isinstance(space, MultiDiscrete): + if time_axis: + input_ = np.reshape(input_, [B * T, -1]) + if batch_axis: + out.append( + np.concatenate( + [ + one_hot(input_[:, i], depth=n).astype(np.float32) + for i, n in enumerate(space.nvec) + ], + axis=-1, + ) + ) + else: + out.append( + np.concatenate( + [ + one_hot(input_[i], depth=n).astype(np.float32) + for i, n in enumerate(space.nvec) + ], + axis=-1, + ) + ) + # Box: Flatten. + else: + # Special case for spaces: Box(.., shape=(), ..) + if isinstance(input_, float): + input_ = np.array([input_]) + + if time_axis: + input_ = np.reshape(input_, [B * T, -1]) + elif batch_axis: + input_ = np.reshape(input_, [B, -1]) + else: + input_ = np.reshape(input_, [-1]) + out.append(input_.astype(np.float32)) + + merged = np.concatenate(out, axis=-1) + # Restore the time-dimension, if applicable. + if time_axis: + merged = np.reshape(merged, [B, T, -1]) + return merged + + +@PublicAPI +def make_action_immutable(obj): + """Flags actions immutable to notify users when trying to change them. 
@PublicAPI
def make_action_immutable(obj):
    """Flags actions immutable to notify users when trying to change them.

    Can also be used with any tree-like structure containing either
    dictionaries, numpy arrays or already immutable objects per se.
    Note, however, that `tree.map_structure()` will in general not
    include the shallow object containing all others and therefore
    immutability will hold only for all objects contained in it.
    Use `tree.traverse(fun, action, top_down=False)` to also include
    the containing object.

    Args:
        obj: The object to be made immutable.

    Returns:
        The immutable object.

    .. testcode::
        :skipif: True

        import tree
        import numpy as np
        from ray.rllib.utils.numpy import make_action_immutable
        arr = np.arange(1,10)
        d = dict(a = 1, b = (arr, arr))
        tree.traverse(make_action_immutable, d, top_down=False)
    """
    if isinstance(obj, np.ndarray):
        # Mark the array read-only in place (no copy is made).
        obj.setflags(write=False)
        return obj
    elif isinstance(obj, OrderedDict):
        # Convert to a plain dict first, then wrap in a read-only view.
        return MappingProxyType(dict(obj))
    elif isinstance(obj, dict):
        return MappingProxyType(obj)
    else:
        # Assumed to already be immutable (e.g. int, float, tuple).
        return obj


@PublicAPI
def huber_loss(x: np.ndarray, delta: float = 1.0) -> np.ndarray:
    """Computes the huber loss: quadratic near zero, linear in the tails.

    Reference: https://en.wikipedia.org/wiki/Huber_loss.

    Args:
        x: The input (error) values.
        delta: The threshold at which the loss switches from quadratic to
            linear behavior.

    Returns:
        The elementwise huber loss of `x`.
    """
    return np.where(
        np.abs(x) < delta, np.power(x, 2.0) * 0.5, delta * (np.abs(x) - 0.5 * delta)
    )


@PublicAPI
def l2_loss(x: np.ndarray) -> np.ndarray:
    """Computes half the L2 norm of a tensor (w/o the sqrt): sum(x**2) / 2.

    Args:
        x: The input tensor.

    Returns:
        The l2-loss output according to the above formula given `x`.
    """
    return np.sum(np.square(x)) / 2.0


@PublicAPI
def lstm(
    x,
    weights: np.ndarray,
    biases: Optional[np.ndarray] = None,
    initial_internal_states: Optional[np.ndarray] = None,
    time_major: bool = False,
    forget_bias: float = 1.0,
):
    """Calculates LSTM layer output given weights/biases, states, and input.

    Args:
        x: The inputs to the LSTM layer including time-rank
            (0th if time-major, else 1st) and the batch-rank
            (1st if time-major, else 0th).
        weights: The weights matrix.
        biases: The biases vector. All 0s if None.
        initial_internal_states: The initial internal
            states to pass into the layer. All 0s if None.
        time_major: Whether to use time-major or not. Default: False.
        forget_bias: Gets added to first sigmoid (forget gate) output.
            Default: 1.0.

    Returns:
        Tuple consisting of 1) The LSTM layer's output and
        2) Tuple: Last (c-state, h-state).
    """
    sequence_length = x.shape[0 if time_major else 1]
    batch_size = x.shape[1 if time_major else 0]
    # The weights matrix stacks all 4 gates along its 2nd axis
    # (3x sigmoid, 1x tanh).
    units = weights.shape[1] // 4

    # Fix: `biases=None` was documented as "all 0s", but the original code
    # crashed (`matmul(...) + None`). Substitute an actual zero vector.
    if biases is None:
        biases = np.zeros(shape=(weights.shape[1],))

    if initial_internal_states is None:
        c_states = np.zeros(shape=(batch_size, units))
        h_states = np.zeros(shape=(batch_size, units))
    else:
        c_states = initial_internal_states[0]
        h_states = initial_internal_states[1]

    # Create a placeholder for all n-time step outputs.
    if time_major:
        unrolled_outputs = np.zeros(shape=(sequence_length, batch_size, units))
    else:
        unrolled_outputs = np.zeros(shape=(batch_size, sequence_length, units))

    # Push the batch through the LSTM cell one timestep at a time and capture
    # the outputs plus the final h- and c-states.
    for t in range(sequence_length):
        input_matrix = x[t, :, :] if time_major else x[:, t, :]
        input_matrix = np.concatenate((input_matrix, h_states), axis=1)
        input_matmul_matrix = np.matmul(input_matrix, weights) + biases
        # Forget gate (3rd slot in tf output matrix). Add static forget bias.
        sigmoid_1 = sigmoid(input_matmul_matrix[:, units * 2 : units * 3] + forget_bias)
        c_states = np.multiply(c_states, sigmoid_1)
        # Add gate (1st and 2nd slots in tf output matrix).
        sigmoid_2 = sigmoid(input_matmul_matrix[:, 0:units])
        tanh_3 = np.tanh(input_matmul_matrix[:, units : units * 2])
        c_states = np.add(c_states, np.multiply(sigmoid_2, tanh_3))
        # Output gate (last slot in tf output matrix).
        sigmoid_4 = sigmoid(input_matmul_matrix[:, units * 3 : units * 4])
        h_states = np.multiply(sigmoid_4, np.tanh(c_states))

        # Store this output time-slice.
        if time_major:
            unrolled_outputs[t, :, :] = h_states
        else:
            unrolled_outputs[:, t, :] = h_states

    return unrolled_outputs, (c_states, h_states)


@PublicAPI
def one_hot(
    x: Union[TensorType, int],
    depth: int = 0,
    on_value: float = 1.0,
    off_value: float = 0.0,
    dtype: type = np.float32,
) -> np.ndarray:
    """One-hot utility function for numpy.

    Thanks to qianyizhang:
    https://gist.github.com/qianyizhang/07ee1c15cad08afb03f5de69349efc30.

    Args:
        x: The input to be one-hot encoded.
        depth: The max. number to be one-hot encoded (size of last rank).
            If 0, infer the depth from the values in `x` (max(x) + 1).
        on_value: The value to use for on. Default: 1.0.
        off_value: The value to use for off. Default: 0.0.
        dtype: The numpy dtype of the returned array. Default: np.float32.

    Returns:
        The one-hot encoded equivalent of the input array.
    """
    # Handle simple ints properly.
    if isinstance(x, int):
        x = np.array(x, dtype=np.int32)
    # Handle torch arrays properly.
    elif torch and isinstance(x, torch.Tensor):
        x = x.numpy()

    # Handle bool arrays correctly: treat as binary (depth=2).
    if x.dtype == np.bool_:
        x = x.astype(np.int_)
        depth = 2

    # If depth is not given, try to infer it from the values in the array.
    if depth == 0:
        depth = np.max(x) + 1
    assert (
        np.max(x) < depth
    ), "ERROR: The max. index of `x` ({}) is larger than depth ({})!".format(
        np.max(x), depth
    )
    shape = x.shape

    # Start with an all-`off_value` output, then scatter `on_value` into the
    # positions given by the (tiled) index grids plus `x` as the last index.
    out = np.ones(shape=(*shape, depth)) * off_value
    indices = []
    for i in range(x.ndim):
        tiles = [1] * x.ndim
        s = [1] * x.ndim
        s[i] = -1
        r = np.arange(shape[i]).reshape(s)
        if i > 0:
            tiles[i - 1] = shape[i - 1]
            r = np.tile(r, tiles)
        indices.append(r)
    indices.append(x)
    out[tuple(indices)] = on_value
    return out.astype(dtype)


@PublicAPI
def one_hot_multidiscrete(x, depths: Optional[List[int]] = None):
    """Multi-one-hot encodes a (batch of) MultiDiscrete sample(s).

    Args:
        x: The input to be encoded: either a single MultiDiscrete sample
            (1D) or a batch thereof (2D, batch axis first).
        depths: The number of categories per MultiDiscrete dimension
            (e.g. `space.nvec`). Must be provided by the caller.

    Returns:
        The concatenation (along the last axis) of the one-hot encodings of
        each MultiDiscrete dimension, as float32.
    """
    # Fix: the original signature was `depths=List[int]`, accidentally using
    # the typing alias as the default *value* instead of an annotation.
    if depths is None:
        depths = []

    # Handle torch arrays properly.
    if torch and isinstance(x, torch.Tensor):
        x = x.numpy()

    shape = x.shape
    return np.concatenate(
        [
            one_hot(x[i] if len(shape) == 1 else x[:, i], depth=n).astype(np.float32)
            for i, n in enumerate(depths)
        ],
        axis=-1,
    )


@PublicAPI
def relu(x: np.ndarray, alpha: float = 0.0) -> np.ndarray:
    """Implementation of the leaky ReLU function.

    y = x * alpha if x < 0 else x

    Args:
        x: The input values.
        alpha: A scaling ("leak") factor to use for negative x.

    Returns:
        The leaky ReLU output for x.
    """
    # Fix: the original passed `x` as the third (``out``) argument of
    # `np.maximum`, silently mutating the caller's array in place.
    return np.maximum(x, x * alpha)


@PublicAPI
def sigmoid(x: np.ndarray, derivative: bool = False) -> np.ndarray:
    """Returns the sigmoid function applied to x.

    Alternatively, can return the derivative of the sigmoid function (in
    which case `x` is expected to be sigmoid(input) already).

    Args:
        x: The input to the sigmoid function.
        derivative: Whether to return the derivative or not.
            Default: False.

    Returns:
        The sigmoid function (or its derivative) applied to x.
    """
    if derivative:
        return x * (1 - x)
    else:
        return 1 / (1 + np.exp(-x))


@PublicAPI
def softmax(
    x: Union[np.ndarray, list], axis: int = -1, epsilon: Optional[float] = None
) -> np.ndarray:
    """Returns the softmax values for x.

    The exact formula used is:
    S(xi) = e^xi / SUMj(e^xj), where j goes over all elements in x.

    Args:
        x: The input to the softmax function.
        axis: The axis along which to softmax.
        epsilon: Optional epsilon as a minimum value. If None, use
            `SMALL_NUMBER`.

    Returns:
        The softmax over x.
    """
    epsilon = epsilon or SMALL_NUMBER
    x_exp = np.exp(x)
    # Clamp from below with `epsilon` to avoid exact zeros in the output.
    return np.maximum(x_exp / np.sum(x_exp, axis, keepdims=True), epsilon)
# ============================================================================
# ray/rllib/utils/postprocessing/episodes.py
# ============================================================================
from typing import List, Tuple

import numpy as np

from ray.rllib.env.single_agent_episode import SingleAgentEpisode
from ray.util.annotations import DeveloperAPI


@DeveloperAPI
def add_one_ts_to_episodes_and_truncate(episodes: List[SingleAgentEpisode]):
    """Adds an artificial timestep to an episode at the end.

    In detail: The last observations, infos, actions, and all
    `extra_model_outputs` will be duplicated and appended to each episode's
    data. An extra 0.0 reward will be appended to the episode's rewards. The
    episode's timestep will be increased by 1. Also, adds the truncated=True
    flag to each episode if the episode is not already done (terminated or
    truncated).

    Useful for value function bootstrapping, where it is required to compute a
    forward pass for the very last timestep within the episode,
    i.e. using the following input dict: {
        obs=[final obs],
        state=[final state output],
        prev. reward=[final reward],
        etc..
    }

    Args:
        episodes: The list of SingleAgentEpisode objects to extend by one
            timestep and add a truncation flag if necessary.

    Returns:
        A list of the original episodes' truncated values (so the episodes can
        be properly restored later into their original states).
    """
    orig_truncateds = []
    for episode in episodes:
        orig_truncateds.append(episode.is_truncated)

        # Add timestep.
        episode.t += 1
        # Use the episode API that allows appending (possibly complex) structs
        # to the data.
        episode.observations.append(episode.observations[-1])
        episode.infos.append(episode.infos[-1])
        episode.actions.append(episode.actions[-1])
        episode.rewards.append(0.0)
        for v in episode.extra_model_outputs.values():
            v.append(v[-1])
        # Artificially make this episode truncated for the upcoming GAE
        # computations.
        if not episode.is_done:
            episode.is_truncated = True
        # Validate to make sure everything is in order.
        episode.validate()

    return orig_truncateds


@DeveloperAPI
def remove_last_ts_from_data(
    episode_lens: List[int],
    *data: np.ndarray,
) -> Tuple[np.ndarray, ...]:
    """Removes the last timesteps from each given data item.

    Each item in data is a concatenated sequence of episodes data.
    For example if `episode_lens` is [2, 4], then data is a shape=(6,)
    ndarray. The returned corresponding value will have shape (4,), meaning
    both episodes have been shortened by exactly one timestep to 1 and 3.

    ..testcode::

        from ray.rllib.utils.postprocessing.episodes import (
            remove_last_ts_from_data
        )
        import numpy as np

        unpadded = remove_last_ts_from_data(
            [5, 3],
            np.array([0, 1, 2, 3, 4, 0, 1, 2]),
        )
        assert (unpadded == [0, 1, 2, 3, 0, 1]).all()

        unpadded = remove_last_ts_from_data(
            [4, 2, 3],
            np.array([0, 1, 2, 3, 0, 1, 0, 1, 2]),
            np.array([4, 5, 6, 7, 2, 3, 3, 4, 5]),
        )
        assert (unpadded[0] == [0, 1, 2, 0, 0, 1]).all()
        assert (unpadded[1] == [4, 5, 6, 2, 3, 4]).all()

    Args:
        episode_lens: A list of current episode lengths. The returned data
            will have the same lengths minus 1 timestep.
        data: A tuple of data items (np.ndarrays) representing concatenated
            episodes to be shortened by one timestep per episode.
            Note that only arrays with `shape=(n,)` are supported! The
            returned data will have `shape=(n-len(episode_lens),)` (each
            episode gets shortened by one timestep).

    Returns:
        A tuple of new data items shortened by one timestep (or the single
        shortened item if only one data item was passed in).
    """
    # Figure out the new slices to apply to each data item based on
    # the given episode_lens. (Local renamed from `sum`, which shadowed the
    # builtin.)
    slices = []
    offset = 0
    for len_ in episode_lens:
        slices.append(slice(offset, offset + len_ - 1))
        offset += len_
    # Compile return data by slicing off one timestep at the end of
    # each episode.
    ret = []
    for d in data:
        ret.append(np.concatenate([d[s] for s in slices]))
    return tuple(ret) if len(ret) > 1 else ret[0]


@DeveloperAPI
def remove_last_ts_from_episodes_and_restore_truncateds(
    episodes: List[SingleAgentEpisode],
    orig_truncateds: List[bool],
) -> None:
    """Reverts the effects of `add_one_ts_to_episodes_and_truncate`.

    Args:
        episodes: The list of SingleAgentEpisode objects to shorten by one
            timestep (removing the artificially added data again).
        orig_truncateds: A list of the original episodes' truncated values to
            be applied to the `episodes`.
    """
    # Fix all episodes.
    for episode, orig_truncated in zip(episodes, orig_truncateds):
        # Reduce timesteps by 1.
        episode.t -= 1
        # Remove all extra timestep data from the episode's buffers.
        episode.observations.pop()
        episode.infos.pop()
        episode.actions.pop()
        episode.rewards.pop()
        for v in episode.extra_model_outputs.values():
            v.pop()
        # Fix the truncateds flag again.
        episode.is_truncated = orig_truncated


# ============================================================================
# ray/rllib/utils/postprocessing/value_predictions.py
# ============================================================================
import numpy as np

from ray.util.annotations import DeveloperAPI


@DeveloperAPI
def compute_value_targets(
    values,
    rewards,
    terminateds,
    truncateds,
    gamma: float,
    lambda_: float,
):
    """Computes value function (vf) targets given vf predictions and rewards.

    Note that advantages can then easily be computed via the formula:
    advantages = targets - vf_predictions

    Args:
        values: The vf predictions, one per timestep.
        rewards: The rewards, one per timestep.
        terminateds: Per-timestep 0/1 flags: was the episode terminated here?
        truncateds: Per-timestep 0/1 flags: was the episode truncated here?
        gamma: The discount factor.
        lambda_: The lambda mixing parameter.

    Returns:
        The float32 value targets, one per timestep.
    """
    # Force-set all values at terminals (not at truncations!) to 0.0.
    orig_values = flat_values = values * (1.0 - terminateds)

    flat_values = np.append(flat_values, 0.0)
    intermediates = rewards + gamma * (1 - lambda_) * flat_values[1:]
    continues = 1.0 - terminateds

    # Accumulate targets backwards through time.
    Rs = []
    last = flat_values[-1]
    for t in reversed(range(intermediates.shape[0])):
        last = intermediates[t] + continues[t] * gamma * lambda_ * last
        Rs.append(last)
        # At a truncation boundary, restart the accumulation from the
        # (unzeroed) value prediction at that timestep.
        if truncateds[t]:
            last = orig_values[t]

    # Reverse back to correct (time) direction.
    value_targets = np.stack(list(reversed(Rs)), axis=0)

    return value_targets.astype(np.float32)


def extract_bootstrapped_values(vf_preds, episode_lengths, T):
    """Returns a bootstrapped value batch given value predictions.

    Note that the incoming value predictions must have happened over
    (artificially) elongated episodes (by 1 timestep at the end). This way, we
    can either extract the `vf_preds` at these extra timesteps (as "bootstrap
    values") or skip over them entirely if they lie in the middle of the
    T-slices.

    For example, given an episodes structure like this:
    01234a 0123456b 01c 012- 0123e 012-
    where each episode is separated by a space and goes from 0 to n and ends in
    an artificially elongated timestep (denoted by 'a', 'b', 'c', '-', or 'e'),
    where '-' means that the episode was terminated and the bootstrap value at
    the end should be zero and 'a', 'b', 'c', etc.. represent truncated episode
    ends with computed vf estimates.
    The output for the above sequence (and T=4) should then be:
    4 3 b 2 3 -

    Args:
        vf_preds: The computed value function predictions over the artificially
            elongated episodes (by one timestep at the end).
        episode_lengths: The original (correct) episode lengths, NOT counting
            the artificially added timestep at the end.
        T: The size of the time dimension by which to slice the data. Note that
            the sum of all episode lengths (`sum(episode_lengths)`) must be
            divisible by T.

    Returns:
        The batch of bootstrapped values.
    """
    bootstrapped_values = []
    if sum(episode_lengths) % T != 0:
        raise ValueError(
            "Can only extract bootstrapped values if the sum of episode "
            f"lengths ({sum(episode_lengths)}) is divisible by the given T "
            f"({T})!"
        )

    # Loop over all episode lengths and collect bootstrap values.
    # Do not alter incoming `episode_lengths` list.
    episode_lengths = episode_lengths[:]
    i = -1
    while i < len(episode_lengths) - 1:
        i += 1
        eps_len = episode_lengths[i]
        # We can make another T-stride inside this episode ->
        # - Use a vf prediction within the episode as bootstrapped value.
        # - "Fix" the episode_lengths array and continue within the same
        #   episode.
        if T < eps_len:
            bootstrapped_values.append(vf_preds[T])
            vf_preds = vf_preds[T:]
            episode_lengths[i] -= T
            i -= 1
        # We can make another T-stride inside this episode, but will then be at
        # the end of it ->
        # - Use the value function prediction at the artificially added
        #   timestep as bootstrapped value.
        # - Skip the additional timestep at the end and move on with the next
        #   episode.
        elif T == eps_len:
            bootstrapped_values.append(vf_preds[T])
            vf_preds = vf_preds[T + 1 :]
        # The episode fits entirely into the T-stride ->
        # - Move on to the next episode ("fix" its length by making it
        #   seemingly longer).
        else:
            # Skip bootstrap value of current episode (not needed).
            vf_preds = vf_preds[1:]
            # Make next episode seem longer.
            episode_lengths[i + 1] += eps_len

    return np.array(bootstrapped_values)


# ============================================================================
# ray/rllib/utils/postprocessing/zero_padding.py
# ============================================================================
from collections import deque
from typing import List, Tuple, Union

import numpy as np
import tree  # pip install dm_tree

from ray.rllib.utils.spaces.space_utils import batch, BatchedNdArray
from ray.util.annotations import DeveloperAPI


@DeveloperAPI
def create_mask_and_seq_lens(episode_len: int, T: int) -> Tuple[List, List]:
    """Creates loss mask and a seq_lens array, given an episode length and T.

    Args:
        episode_len: The episode length to infer the loss mask and seq_lens
            array from.
        T: The maximum number of timesteps in each "row", also known as the
            maximum sequence length (max_seq_len). Episodes are split into
            chunks that are at most `T` long and remaining timesteps will be
            zero-padded (and masked out).

    Returns:
        Tuple consisting of a) list of the loss masks to use (masking out
        areas that are past the end of an episode (or rollout), but had to be
        zero-added due to the added extra time rank (of length T) and b) the
        list of sequence lengths resulting from splitting the given episode
        into chunks of at most `T` timesteps.
    """
    mask = []
    seq_lens = []

    # First chunk: at most T timesteps (possibly fewer, then padded).
    len_ = min(episode_len, T)
    seq_lens.append(len_)
    row = np.array([1] * len_ + [0] * (T - len_), np.bool_)
    mask.append(row)

    # Handle sequence lengths greater than T.
    overflow = episode_len - T
    while overflow > 0:
        len_ = min(overflow, T)
        seq_lens.append(len_)
        extra_row = np.array([1] * len_ + [0] * (T - len_), np.bool_)
        mask.append(extra_row)
        overflow -= T

    return mask, seq_lens


@DeveloperAPI
def split_and_zero_pad(
    item_list: List[Union[BatchedNdArray, np.ndarray, float]],
    max_seq_len: int,
) -> List[np.ndarray]:
    """Splits the contents of `item_list` into a new list of ndarrays.

    In the returned list, each item is one ndarray of len (axis=0)
    `max_seq_len`. The last item in the returned list may be (right)
    zero-padded, if necessary, to reach `max_seq_len`.

    If `item_list` contains one or more `BatchedNdArray` (instead of
    individual items), these will be split accordingly along their axis=0 to
    yield the returned structure described above.

    .. testcode::

        from ray.rllib.utils.postprocessing.zero_padding import (
            BatchedNdArray,
            split_and_zero_pad,
        )
        from ray.rllib.utils.test_utils import check

        # Simple case: `item_list` contains individual floats.
        check(
            split_and_zero_pad([0, 1, 2, 3, 4, 5, 6, 7], 5),
            [[0, 1, 2, 3, 4], [5, 6, 7, 0, 0]],
        )

        # `item_list` contains BatchedNdArray (ndarrays that explicitly
        # declare they have a batch axis=0).
        check(
            split_and_zero_pad([
                BatchedNdArray([0, 1]),
                BatchedNdArray([2, 3, 4, 5]),
                BatchedNdArray([6, 7, 8]),
            ], 5),
            [[0, 1, 2, 3, 4], [5, 6, 7, 8, 0]],
        )

    Args:
        item_list: A list of individual items or BatchedNdArrays to be split
            into `max_seq_len` long pieces (the last of which may be
            zero-padded).
        max_seq_len: The maximum length of each item in the returned list.

    Returns:
        A list of np.ndarrays (all of length `max_seq_len`), which contains
        the same data as `item_list`, but split into sub-chunks of size
        `max_seq_len`. The last item in the returned list may be zero-padded,
        if necessary.
    """
    # The all-zeros element used for right-padding, shaped like one item.
    zero_element = tree.map_structure(
        lambda s: np.zeros_like([s[0]] if isinstance(s, BatchedNdArray) else s),
        item_list[0],
    )

    # The replacement list (to be returned) for `items_list`.
    # Items list contains n individual items.
    # -> ret will contain m batched rows, where m == n // T and the last row
    # may be zero padded (until T).
    ret = []

    # List of the T-axis item, collected to form the next row.
    current_time_row = []
    current_t = 0

    item_list = deque(item_list)
    while len(item_list) > 0:
        item = item_list.popleft()
        # `item` is already a batched np.array: Split if necessary.
        if isinstance(item, BatchedNdArray):
            t = max_seq_len - current_t
            current_time_row.append(item[:t])
            if len(item) <= t:
                current_t += len(item)
            else:
                # Push the unconsumed remainder back for the next row.
                current_t += t
                item_list.appendleft(item[t:])
        # `item` is a single item (no batch axis): Append and continue with
        # next item.
        else:
            current_time_row.append(item)
            current_t += 1

        # `current_time_row` is "full" (max_seq_len): Append as ndarray (with
        # batch axis) to `ret`.
        if current_t == max_seq_len:
            ret.append(
                batch(
                    current_time_row,
                    individual_items_already_have_batch_dim="auto",
                )
            )
            current_time_row = []
            current_t = 0

    # `current_time_row` is unfinished: Pad, if necessary and append to `ret`.
    if current_t > 0 and current_t < max_seq_len:
        current_time_row.extend([zero_element] * (max_seq_len - current_t))
        ret.append(
            batch(current_time_row, individual_items_already_have_batch_dim="auto")
        )

    return ret


@DeveloperAPI
def split_and_zero_pad_n_episodes(nd_array, episode_lens, max_seq_len):
    """Splits and zero-pads the data of several concatenated episodes.

    Args:
        nd_array: The data of all episodes, concatenated along axis 0.
        episode_lens: The lengths of the individual episodes in `nd_array`.
        max_seq_len: The maximum sequence length to split into / pad to.

    Returns:
        A list of np.ndarrays (each of length `max_seq_len`); per episode, the
        last chunk may be zero-padded.
    """
    ret = []
    cursor = 0
    for episode_len in episode_lens:
        # Wrap each episode's slice so `split_and_zero_pad` treats it as
        # batched data (split along axis 0) rather than a single item.
        items = BatchedNdArray(nd_array[cursor : cursor + episode_len])
        ret.extend(split_and_zero_pad([items], max_seq_len))
        cursor += episode_len

    return ret


@DeveloperAPI
def unpad_data_if_necessary(episode_lens, data):
    """Removes right-side zero-padding from data based on `episode_lens`.

    ..testcode::

        from ray.rllib.utils.postprocessing.zero_padding import (
            unpad_data_if_necessary
        )
        import numpy as np

        unpadded = unpad_data_if_necessary(
            episode_lens=[4, 2],
            data=np.array([
                [2, 4, 5, 3, 0, 0, 0, 0],
                [-1, 3, 0, 0, 0, 0, 0, 0],
            ]),
        )
        assert (unpadded == [2, 4, 5, 3, -1, 3]).all()

        unpadded = unpad_data_if_necessary(
            episode_lens=[1, 5],
            data=np.array([
                [2, 0, 0, 0, 0],
                [-1, -2, -3, -4, -5],
            ]),
        )
        assert (unpadded == [2, -1, -2, -3, -4, -5]).all()

    Args:
        episode_lens: A list of actual episode lengths.
        data: A 2D np.ndarray with right-side zero-padded rows.

    Returns:
        A 1D np.ndarray resulting from concatenation of the un-padded
        input data along the 0-axis.
    """
    # If data does NOT have a time dimension, return right away.
    if len(data.shape) == 1:
        return data

    # Assert we only have B and T dimensions (meaning this function only
    # operates on single-float data, such as value function predictions,
    # advantages, or rewards).
    assert len(data.shape) == 2

    new_data = []
    row_idx = 0

    T = data.shape[1]
    for len_ in episode_lens:
        # Calculate how many full rows this array occupies and how many
        # elements are in the last, potentially partial row.
        num_rows, col_idx = divmod(len_, T)

        # If the array spans multiple full rows, fully include these rows.
        for i in range(num_rows):
            new_data.append(data[row_idx])
            row_idx += 1

        # If there are elements in the last, potentially partial row, add this
        # partial row as well.
        if col_idx > 0:
            new_data.append(data[row_idx, :col_idx])

            # Move to the next row for the next array (skip the zero-padding
            # zone).
            row_idx += 1

    return np.concatenate(new_data)
# ============================================================================
# ray/rllib/utils/pre_checks/env.py
# ============================================================================
"""Common pre-checks for all RLlib experiments."""
import logging
from typing import TYPE_CHECKING, Set

import gymnasium as gym
import numpy as np
import tree  # pip install dm_tree

from ray.rllib.utils.annotations import DeveloperAPI
from ray.rllib.utils.error import ERR_MSG_OLD_GYM_API, UnsupportedSpaceException
from ray.rllib.utils.spaces.space_utils import get_base_struct_from_space
from ray.util import log_once

if TYPE_CHECKING:
    from ray.rllib.env import MultiAgentEnv

logger = logging.getLogger(__name__)


@DeveloperAPI
def check_multiagent_environments(env: "MultiAgentEnv") -> None:
    """Checking for common errors in RLlib MultiAgentEnvs.

    Performs a reset() and a single step() with sampled actions and validates
    the returned observations, rewards, done/truncated flags, and infos.

    Args:
        env: The env to be checked.

    Raises:
        ValueError: If the env is not a MultiAgentEnv or any of its returned
            values do not conform to the MultiAgentEnv API.
    """
    from ray.rllib.env import MultiAgentEnv

    if not isinstance(env, MultiAgentEnv):
        raise ValueError("The passed env is not a MultiAgentEnv.")
    elif not (
        hasattr(env, "observation_space")
        and hasattr(env, "action_space")
        and hasattr(env, "_agent_ids")
    ):
        if log_once("ma_env_super_ctor_called"):
            logger.warning(
                f"Your MultiAgentEnv {env} does not have some or all of the needed "
                "base-class attributes! Make sure you call `super().__init__()` from "
                "within your MutiAgentEnv's constructor. "
                "This will raise an error in the future."
            )
        return

    try:
        obs_and_infos = env.reset(seed=42, options={})
    except Exception as e:
        raise ValueError(
            ERR_MSG_OLD_GYM_API.format(
                env, "In particular, the `reset()` method seems to be faulty."
            )
        ) from e
    reset_obs, reset_infos = obs_and_infos

    _check_if_element_multi_agent_dict(env, reset_obs, "reset()")

    # Sample one action per agent that stepped in the reset obs.
    sampled_action = {
        aid: env.get_action_space(aid).sample() for aid in reset_obs.keys()
    }
    _check_if_element_multi_agent_dict(
        env, sampled_action, "get_action_space(agent_id=..).sample()"
    )

    try:
        results = env.step(sampled_action)
    except Exception as e:
        raise ValueError(
            ERR_MSG_OLD_GYM_API.format(
                env, "In particular, the `step()` method seems to be faulty."
            )
        ) from e
    next_obs, reward, done, truncated, info = results

    _check_if_element_multi_agent_dict(env, next_obs, "step, next_obs")
    _check_if_element_multi_agent_dict(env, reward, "step, reward")
    _check_if_element_multi_agent_dict(env, done, "step, done")
    _check_if_element_multi_agent_dict(env, truncated, "step, truncated")
    _check_if_element_multi_agent_dict(env, info, "step, info", allow_common=True)
    _check_reward({"dummy_env_id": reward}, base_env=True, agent_ids=env.agents)
    _check_done_and_truncated(
        {"dummy_env_id": done},
        {"dummy_env_id": truncated},
        base_env=True,
        agent_ids=env.agents,
    )
    _check_info({"dummy_env_id": info}, base_env=True, agent_ids=env.agents)


def _check_reward(reward, base_env=False, agent_ids=None):
    # Validate that rewards are scalar ints/floats (not bools) and that, for
    # base envs, all agent ids are known to the env.
    if base_env:
        for _, multi_agent_dict in reward.items():
            for agent_id, rew in multi_agent_dict.items():
                if not (
                    np.isreal(rew)
                    and not isinstance(rew, bool)
                    and (
                        np.isscalar(rew)
                        or (isinstance(rew, np.ndarray) and rew.shape == ())
                    )
                ):
                    error = (
                        "Your step function must return rewards that are"
                        f" integer or float. reward: {rew}. Instead it was a "
                        f"{type(rew)}"
                    )
                    raise ValueError(error)
                if not (agent_id in agent_ids or agent_id == "__all__"):
                    error = (
                        f"Your reward dictionary must have agent ids that belong to "
                        f"the environment. AgentIDs received from "
                        f"env.agents are: {agent_ids}"
                    )
                    raise ValueError(error)
    elif not (
        np.isreal(reward)
        and not isinstance(reward, bool)
        and (
            np.isscalar(reward)
            or (isinstance(reward, np.ndarray) and reward.shape == ())
        )
    ):
        error = (
            "Your step function must return a reward that is integer or float. "
            "Instead it was a {}".format(type(reward))
        )
        raise ValueError(error)


def _check_done_and_truncated(done, truncated, base_env=False, agent_ids=None):
    # Validate that done/truncated flags are booleans and that, for base envs,
    # all agent ids are known to the env.
    for what in ["done", "truncated"]:
        data = done if what == "done" else truncated
        if base_env:
            for _, multi_agent_dict in data.items():
                for agent_id, done_ in multi_agent_dict.items():
                    if not isinstance(done_, (bool, np.bool_)):
                        # Fix: report the offending value's type (`done_`),
                        # not the type of the whole dict (`data`).
                        raise ValueError(
                            f"Your step function must return `{what}s` that are "
                            f"boolean. But instead was a {type(done_)}"
                        )
                    if not (agent_id in agent_ids or agent_id == "__all__"):
                        error = (
                            f"Your `{what}s` dictionary must have agent ids that "
                            f"belong to the environment. AgentIDs received from "
                            f"env.agents are: {agent_ids}"
                        )
                        raise ValueError(error)
        elif not isinstance(data, (bool, np.bool_)):
            error = (
                f"Your step function must return a `{what}` that is a boolean. But "
                f"instead was a {type(data)}"
            )
            raise ValueError(error)


def _check_info(info, base_env=False, agent_ids=None):
    # Validate that infos are dicts and that, for base envs, all agent ids are
    # known to the env (the special "__common__" key is also allowed).
    if base_env:
        for _, multi_agent_dict in info.items():
            for agent_id, inf in multi_agent_dict.items():
                if not isinstance(inf, dict):
                    raise ValueError(
                        "Your step function must return infos that are a dict. "
                        f"instead was a {type(inf)}: element: {inf}"
                    )
                if not (
                    agent_id in agent_ids
                    or agent_id == "__all__"
                    or agent_id == "__common__"
                ):
                    # Fix: this message wrongly said "dones dictionary"
                    # (copy-paste from the done check).
                    error = (
                        f"Your infos dictionary must have agent ids that belong to "
                        f"the environment. AgentIDs received from "
                        f"env.agents are: {agent_ids}"
                    )
                    raise ValueError(error)
    elif not isinstance(info, dict):
        error = (
            "Your step function must return a info that "
            f"is a dict. element type: {type(info)}. element: {info}"
        )
        raise ValueError(error)


def _not_contained_error(func_name, _type):
    # Fix: the original concatenated f-string fragments without separating
    # spaces ("typemismatch", "ofnp.float64", "wasout of bounds").
    _error = (
        f"The {_type} collected from {func_name} was not contained within"
        f" your env's {_type} space. It's possible that there was a type "
        f"mismatch (for example {_type}s of np.float32 and a space of "
        f"np.float64 {_type}s), or that one of the sub-{_type}s was "
        f"out of bounds"
    )
    return _error


def _check_if_element_multi_agent_dict(
    env,
    element,
    function_string,
    base_env=False,
    allow_common=False,
):
    # Validate that `element` is a MultiAgentDict whose keys are agent ids of
    # `env` (plus "__all__" and, if `allow_common`, "__common__").
    if not isinstance(element, dict):
        if base_env:
            error = (
                f"The element returned by {function_string} contains values "
                f"that are not MultiAgentDicts. Instead, they are of "
                f"type: {type(element)}"
            )
        else:
            error = (
                f"The element returned by {function_string} is not a "
                f"MultiAgentDict. Instead, it is of type: "
                f" {type(element)}"
            )
        raise ValueError(error)
    agent_ids: Set = set(env.agents)
    agent_ids.add("__all__")
    if allow_common:
        agent_ids.add("__common__")

    if not all(k in agent_ids for k in element):
        if base_env:
            error = (
                f"The element returned by {function_string} has agent_ids"
                f" that are not the names of the agents in the env."
                f"agent_ids in this\nMultiEnvDict:"
                f" {list(element.keys())}\nAgentIDs in this env: "
                f"{env.agents}"
            )
        else:
            error = (
                f"The element returned by {function_string} has agent_ids"
                f" that are not the names of the agents in the env. "
                f"\nAgentIDs in this MultiAgentDict: "
                f"{list(element.keys())}\nAgentIDs in this env: "
                f"{env.agents}. You likely need to add the attribute `agents` to your "
                f"env, which is a list containing the IDs of agents currently in your "
                f"env/episode, as well as, `possible_agents`, which is a list of all "
                f"possible agents that could ever show up in your env."
            )
        raise ValueError(error)
You likely need to add the attribute `agents` to your " + f"env, which is a list containing the IDs of agents currently in your " + f"env/episode, as well as, `possible_agents`, which is a list of all " + f"possible agents that could ever show up in your env." + ) + raise ValueError(error) + + +def _find_offending_sub_space(space, value): + """Returns error, value, and space when offending `space.contains(value)` fails. + + Returns only the offending sub-value/sub-space in case `space` is a complex Tuple + or Dict space. + + Args: + space: The gym.Space to check. + value: The actual (numpy) value to check for matching `space`. + + Returns: + Tuple consisting of 1) key-sequence of the offending sub-space or the empty + string if `space` is not complex (Tuple or Dict), 2) the offending sub-space, + 3) the offending sub-space's dtype, 4) the offending sub-value, 5) the offending + sub-value's dtype. + + .. testcode:: + :skipif: True + + path, space, space_dtype, value, value_dtype = _find_offending_sub_space( + gym.spaces.Dict({ + -2.0, 1.5, (2, ), np.int8), np.array([-1.5, 3.0]) + ) + + """ + if not isinstance(space, (gym.spaces.Dict, gym.spaces.Tuple)): + return None, space, space.dtype, value, _get_type(value) + + structured_space = get_base_struct_from_space(space) + + def map_fn(p, s, v): + if not s.contains(v): + raise UnsupportedSpaceException((p, s, v)) + + try: + tree.map_structure_with_path(map_fn, structured_space, value) + except UnsupportedSpaceException as e: + space, value = e.args[0][1], e.args[0][2] + return "->".join(e.args[0][0]), space, space.dtype, value, _get_type(value) + + # This is actually an error. 
+ return None, None, None, None, None + + +def _get_type(var): + return var.dtype if hasattr(var, "dtype") else type(var) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/serialization.py b/.venv/lib/python3.11/site-packages/ray/rllib/utils/serialization.py new file mode 100644 index 0000000000000000000000000000000000000000..30eb1aacc5d466f623d8efbeaaac64dec5787f9a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/utils/serialization.py @@ -0,0 +1,418 @@ +import base64 +from collections import OrderedDict +import importlib +import io +import zlib +from typing import Any, Dict, Optional, Sequence, Type, Union + +import gymnasium as gym +import numpy as np + +import ray +from ray.rllib.utils.annotations import DeveloperAPI +from ray.rllib.utils.error import NotSerializable +from ray.rllib.utils.spaces.flexdict import FlexDict +from ray.rllib.utils.spaces.repeated import Repeated +from ray.rllib.utils.spaces.simplex import Simplex + +NOT_SERIALIZABLE = "__not_serializable__" + + +@DeveloperAPI +def convert_numpy_to_python_primitives(obj: Any): + """Convert an object that is a numpy type to a python type. + + If the object is not a numpy type, it is returned unchanged. + + Args: + obj: The object to convert. + """ + if isinstance(obj, np.integer): + return int(obj) + elif isinstance(obj, np.floating): + return float(obj) + elif isinstance(obj, np.bool_): + return bool(obj) + elif isinstance(obj, np.str_): + return str(obj) + elif isinstance(obj, np.ndarray): + ret = obj.tolist() + for i, v in enumerate(ret): + ret[i] = convert_numpy_to_python_primitives(v) + return ret + else: + return obj + + +def _serialize_ndarray(array: np.ndarray) -> str: + """Pack numpy ndarray into Base64 encoded strings for serialization. + + This function uses numpy.save() instead of pickling to ensure + compatibility. + + Args: + array: numpy ndarray. + + Returns: + b64 escaped string. 
+ """ + buf = io.BytesIO() + np.save(buf, array) + return base64.b64encode(zlib.compress(buf.getvalue())).decode("ascii") + + +def _deserialize_ndarray(b64_string: str) -> np.ndarray: + """Unpack b64 escaped string into numpy ndarray. + + This function assumes the unescaped bytes are of npy format. + + Args: + b64_string: Base64 escaped string. + + Returns: + numpy ndarray. + """ + return np.load( + io.BytesIO(zlib.decompress(base64.b64decode(b64_string))), allow_pickle=True + ) + + +@DeveloperAPI +def gym_space_to_dict(space: gym.spaces.Space) -> Dict: + """Serialize a gym Space into a JSON-serializable dict. + + Args: + space: gym.spaces.Space + + Returns: + Serialized JSON string. + """ + if space is None: + return None + + def _box(sp: gym.spaces.Box) -> Dict: + return { + "space": "box", + "low": _serialize_ndarray(sp.low), + "high": _serialize_ndarray(sp.high), + "shape": sp._shape, # shape is a tuple. + "dtype": sp.dtype.str, + } + + def _discrete(sp: gym.spaces.Discrete) -> Dict: + d = { + "space": "discrete", + "n": int(sp.n), + } + # Offset is a relatively new Discrete space feature. + if hasattr(sp, "start"): + d["start"] = int(sp.start) + return d + + def _multi_binary(sp: gym.spaces.MultiBinary) -> Dict: + return { + "space": "multi-binary", + "n": sp.n, + } + + def _multi_discrete(sp: gym.spaces.MultiDiscrete) -> Dict: + return { + "space": "multi-discrete", + "nvec": _serialize_ndarray(sp.nvec), + "dtype": sp.dtype.str, + } + + def _tuple(sp: gym.spaces.Tuple) -> Dict: + return { + "space": "tuple", + "spaces": [gym_space_to_dict(sp) for sp in sp.spaces], + } + + def _dict(sp: gym.spaces.Dict) -> Dict: + return { + "space": "dict", + "spaces": {k: gym_space_to_dict(sp) for k, sp in sp.spaces.items()}, + } + + def _simplex(sp: Simplex) -> Dict: + return { + "space": "simplex", + "shape": sp._shape, # shape is a tuple. 
+ "concentration": sp.concentration, + "dtype": sp.dtype.str, + } + + def _repeated(sp: Repeated) -> Dict: + return { + "space": "repeated", + "child_space": gym_space_to_dict(sp.child_space), + "max_len": sp.max_len, + } + + def _flex_dict(sp: FlexDict) -> Dict: + d = { + "space": "flex_dict", + } + for k, s in sp.spaces: + d[k] = gym_space_to_dict(s) + return d + + def _text(sp: "gym.spaces.Text") -> Dict: + # Note (Kourosh): This only works in gym >= 0.25.0 + charset = getattr(sp, "character_set", None) + if charset is None: + charset = getattr(sp, "charset", None) + if charset is None: + raise ValueError( + "Text space must have a character_set or charset attribute" + ) + return { + "space": "text", + "min_length": sp.min_length, + "max_length": sp.max_length, + "charset": charset, + } + + if isinstance(space, gym.spaces.Box): + return _box(space) + elif isinstance(space, gym.spaces.Discrete): + return _discrete(space) + elif isinstance(space, gym.spaces.MultiBinary): + return _multi_binary(space) + elif isinstance(space, gym.spaces.MultiDiscrete): + return _multi_discrete(space) + elif isinstance(space, gym.spaces.Tuple): + return _tuple(space) + elif isinstance(space, gym.spaces.Dict): + return _dict(space) + elif isinstance(space, gym.spaces.Text): + return _text(space) + elif isinstance(space, Simplex): + return _simplex(space) + elif isinstance(space, Repeated): + return _repeated(space) + elif isinstance(space, FlexDict): + return _flex_dict(space) + else: + raise ValueError("Unknown space type for serialization, ", type(space)) + + +@DeveloperAPI +def space_to_dict(space: gym.spaces.Space) -> Dict: + d = {"space": gym_space_to_dict(space)} + if "original_space" in space.__dict__: + d["original_space"] = space_to_dict(space.original_space) + return d + + +@DeveloperAPI +def gym_space_from_dict(d: Dict) -> gym.spaces.Space: + """De-serialize a dict into gym Space. + + Args: + str: serialized JSON str. + + Returns: + De-serialized gym space. 
+ """ + if d is None: + return None + + def __common(d: Dict): + """Common updates to the dict before we use it to construct spaces""" + ret = d.copy() + del ret["space"] + if "dtype" in ret: + ret["dtype"] = np.dtype(ret["dtype"]) + return ret + + def _box(d: Dict) -> gym.spaces.Box: + ret = d.copy() + ret.update( + { + "low": _deserialize_ndarray(d["low"]), + "high": _deserialize_ndarray(d["high"]), + } + ) + return gym.spaces.Box(**__common(ret)) + + def _discrete(d: Dict) -> gym.spaces.Discrete: + return gym.spaces.Discrete(**__common(d)) + + def _multi_binary(d: Dict) -> gym.spaces.MultiBinary: + return gym.spaces.MultiBinary(**__common(d)) + + def _multi_discrete(d: Dict) -> gym.spaces.MultiDiscrete: + ret = d.copy() + ret.update( + { + "nvec": _deserialize_ndarray(ret["nvec"]), + } + ) + return gym.spaces.MultiDiscrete(**__common(ret)) + + def _tuple(d: Dict) -> gym.spaces.Discrete: + spaces = [gym_space_from_dict(sp) for sp in d["spaces"]] + return gym.spaces.Tuple(spaces=spaces) + + def _dict(d: Dict) -> gym.spaces.Discrete: + # We need to always use an OrderedDict here to cover the following two ways, by + # which a user might construct a Dict space originally. We need to restore this + # original Dict space with the exact order of keys the user intended to. + # - User provides an OrderedDict inside the gym.spaces.Dict constructor -> + # gymnasium should NOT further sort the keys. The same (user-provided) order + # must be restored. + # - User provides a simple dict inside the gym.spaces.Dict constructor -> + # By its API definition, gymnasium automatically sorts all keys alphabetically. + # The same (alphabetical) order must thus be restored. 
+ spaces = OrderedDict( + {k: gym_space_from_dict(sp) for k, sp in d["spaces"].items()} + ) + return gym.spaces.Dict(spaces=spaces) + + def _simplex(d: Dict) -> Simplex: + return Simplex(**__common(d)) + + def _repeated(d: Dict) -> Repeated: + child_space = gym_space_from_dict(d["child_space"]) + return Repeated(child_space=child_space, max_len=d["max_len"]) + + def _flex_dict(d: Dict) -> FlexDict: + spaces = {k: gym_space_from_dict(s) for k, s in d.items() if k != "space"} + return FlexDict(spaces=spaces) + + def _text(d: Dict) -> "gym.spaces.Text": + return gym.spaces.Text(**__common(d)) + + space_map = { + "box": _box, + "discrete": _discrete, + "multi-binary": _multi_binary, + "multi-discrete": _multi_discrete, + "tuple": _tuple, + "dict": _dict, + "simplex": _simplex, + "repeated": _repeated, + "flex_dict": _flex_dict, + "text": _text, + } + + space_type = d["space"] + if space_type not in space_map: + raise ValueError("Unknown space type for de-serialization, ", space_type) + + return space_map[space_type](d) + + +@DeveloperAPI +def space_from_dict(d: Dict) -> gym.spaces.Space: + space = gym_space_from_dict(d["space"]) + if "original_space" in d: + assert "space" in d["original_space"] + if isinstance(d["original_space"]["space"], str): + # For backward compatibility reasons, if d["original_space"]["space"] + # is a string, this original space was serialized by gym_space_to_dict. + space.original_space = gym_space_from_dict(d["original_space"]) + else: + # Otherwise, this original space was serialized by space_to_dict. + space.original_space = space_from_dict(d["original_space"]) + return space + + +@DeveloperAPI +def check_if_args_kwargs_serializable(args: Sequence[Any], kwargs: Dict[str, Any]): + """Check if parameters to a function are serializable by ray. + + Args: + args: arguments to be checked. + kwargs: keyword arguments to be checked. + + Raises: + NoteSerializable if either args are kwargs are not serializable + by ray. 
+ """ + for arg in args: + try: + # if the object is truly serializable we should be able to + # ray.put and ray.get it. + ray.get(ray.put(arg)) + except TypeError as e: + raise NotSerializable( + "RLModule constructor arguments must be serializable. " + f"Found non-serializable argument: {arg}.\n" + f"Original serialization error: {e}" + ) + for k, v in kwargs.items(): + try: + # if the object is truly serializable we should be able to + # ray.put and ray.get it. + ray.get(ray.put(v)) + except TypeError as e: + raise NotSerializable( + "RLModule constructor arguments must be serializable. " + f"Found non-serializable keyword argument: {k} = {v}.\n" + f"Original serialization error: {e}" + ) + + +@DeveloperAPI +def serialize_type(type_: Union[Type, str]) -> str: + """Converts a type into its full classpath ([module file] + "." + [class name]). + + Args: + type_: The type to convert. + + Returns: + The full classpath of the given type, e.g. "ray.rllib.algorithms.ppo.PPOConfig". + """ + # TODO (avnishn): find a way to incorporate the tune registry here. + # Already serialized. + if isinstance(type_, str): + return type_ + + return type_.__module__ + "." + type_.__qualname__ + + +@DeveloperAPI +def deserialize_type( + module: Union[str, Type], error: bool = False +) -> Optional[Union[str, Type]]: + """Resolves a class path to a class. + If the given module is already a class, it is returned as is. + If the given module is a string, it is imported and the class is returned. + + Args: + module: The classpath (str) or type to resolve. + error: Whether to throw a ValueError if `module` could not be resolved into + a class. If False and `module` is not resolvable, returns None. + + Returns: + The resolved class or `module` (if `error` is False and no resolution possible). + + Raises: + ValueError: If `error` is True and `module` cannot be resolved. + """ + # Already a class, return as-is. + if isinstance(module, type): + return module + # A string. 
+ elif isinstance(module, str): + # Try interpreting (as classpath) and importing the given module. + try: + module_path, class_name = module.rsplit(".", 1) + module = importlib.import_module(module_path) + return getattr(module, class_name) + # Module not found OR not a module (but a registered string?). + except (ModuleNotFoundError, ImportError, AttributeError, ValueError) as e: + # Ignore if error=False. + if error: + raise ValueError( + f"Could not deserialize the given classpath `module={module}` into " + "a valid python class! Make sure you have all necessary pip " + "packages installed and all custom modules are in your " + "`PYTHONPATH` env variable." + ) from e + else: + raise ValueError(f"`module` ({module} must be type or string (classpath)!") + + return module diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/tensor_dtype.py b/.venv/lib/python3.11/site-packages/ray/rllib/utils/tensor_dtype.py new file mode 100644 index 0000000000000000000000000000000000000000..83677d80a46a8048011863f65cf8b851d12dbd9e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/utils/tensor_dtype.py @@ -0,0 +1,65 @@ +import numpy as np + +from ray.rllib.utils.typing import TensorType +from ray.rllib.utils.framework import try_import_torch, try_import_tf +from ray.util.annotations import DeveloperAPI + +torch, _ = try_import_torch() +_, tf, _ = try_import_tf() + + +# Dict of NumPy dtype -> torch dtype +if torch: + numpy_to_torch_dtype_dict = { + np.bool_: torch.bool, + np.uint8: torch.uint8, + np.int8: torch.int8, + np.int16: torch.int16, + np.int32: torch.int32, + np.int64: torch.int64, + np.float16: torch.float16, + np.float32: torch.float32, + np.float64: torch.float64, + np.complex64: torch.complex64, + np.complex128: torch.complex128, + } +else: + numpy_to_torch_dtype_dict = {} + +# Dict of NumPy dtype -> tf dtype +if tf: + numpy_to_tf_dtype_dict = { + np.bool_: tf.bool, + np.uint8: tf.uint8, + np.int8: tf.int8, + np.int16: tf.int16, + np.int32: 
tf.int32, + np.int64: tf.int64, + np.float16: tf.float16, + np.float32: tf.float32, + np.float64: tf.float64, + np.complex64: tf.complex64, + np.complex128: tf.complex128, + } +else: + numpy_to_tf_dtype_dict = {} + +# Dict of torch dtype -> NumPy dtype +torch_to_numpy_dtype_dict = { + value: key for (key, value) in numpy_to_torch_dtype_dict.items() +} +# Dict of tf dtype -> NumPy dtype +tf_to_numpy_dtype_dict = {value: key for (key, value) in numpy_to_tf_dtype_dict.items()} + + +@DeveloperAPI +def get_np_dtype(x: TensorType) -> np.dtype: + """Returns the NumPy dtype of the given tensor or array.""" + if torch and isinstance(x, torch.Tensor): + return torch_to_numpy_dtype_dict[x.dtype] + if tf and isinstance(x, tf.Tensor): + return tf_to_numpy_dtype_dict[x.dtype] + elif isinstance(x, np.ndarray): + return x.dtype + else: + raise TypeError("Unsupported type: {}".format(type(x))) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/test_utils.py b/.venv/lib/python3.11/site-packages/ray/rllib/utils/test_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9f739ee9aa1c86b9d138582a19e025297fef3efa --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/utils/test_utils.py @@ -0,0 +1,1847 @@ +import argparse +import json +import logging +import os +import pprint +import random +import re +import time +from typing import ( + TYPE_CHECKING, + Any, + Dict, + List, + Optional, + Tuple, + Type, + Union, +) + +import gymnasium as gym +from gymnasium.spaces import Box, Discrete, MultiDiscrete, MultiBinary +from gymnasium.spaces import Dict as GymDict +from gymnasium.spaces import Tuple as GymTuple +import numpy as np +import tree # pip install dm_tree + +import ray +from ray import train, tune +from ray.air.constants import TRAINING_ITERATION +from ray.air.integrations.wandb import WandbLoggerCallback, WANDB_ENV_VAR +from ray.rllib.core import DEFAULT_MODULE_ID, Columns +from ray.rllib.env.wrappers.atari_wrappers import is_atari, 
wrap_deepmind +from ray.rllib.utils.annotations import OldAPIStack +from ray.rllib.utils.framework import try_import_jax, try_import_tf, try_import_torch +from ray.rllib.utils.metrics import ( + DIFF_NUM_GRAD_UPDATES_VS_SAMPLER_POLICY, + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + EVALUATION_RESULTS, + NUM_ENV_STEPS_TRAINED, + NUM_ENV_STEPS_SAMPLED_LIFETIME, +) +from ray.rllib.utils.typing import ResultDict +from ray.rllib.utils.error import UnsupportedSpaceException + + +from ray.tune import CLIReporter + + +if TYPE_CHECKING: + from ray.rllib.algorithms import Algorithm, AlgorithmConfig + from ray.rllib.offline.dataset_reader import DatasetReader + +jax, _ = try_import_jax() +tf1, tf, tfv = try_import_tf() +torch, _ = try_import_torch() + +logger = logging.getLogger(__name__) + + +def add_rllib_example_script_args( + parser: Optional[argparse.ArgumentParser] = None, + default_reward: float = 100.0, + default_iters: int = 200, + default_timesteps: int = 100000, +) -> argparse.ArgumentParser: + """Adds RLlib-typical (and common) examples scripts command line args to a parser. + + TODO (sven): This function should be used by most of our examples scripts, which + already mostly have this logic in them (but written out). + + Args: + parser: The parser to add the arguments to. If None, create a new one. + default_reward: The default value for the --stop-reward option. + default_iters: The default value for the --stop-iters option. + default_timesteps: The default value for the --stop-timesteps option. + + Returns: + The altered (or newly created) parser object. + """ + if parser is None: + parser = argparse.ArgumentParser() + + # Algo and Algo config options. + parser.add_argument( + "--algo", type=str, default="PPO", help="The RLlib-registered algorithm to use." 
+ ) + parser.add_argument( + "--enable-new-api-stack", + action="store_true", + help="Whether to use the `enable_rl_module_and_learner` config setting.", + ) + parser.add_argument( + "--framework", + choices=["tf", "tf2", "torch"], + default="torch", + help="The DL framework specifier.", + ) + parser.add_argument( + "--env", + type=str, + default=None, + help="The gym.Env identifier to run the experiment with.", + ) + parser.add_argument( + "--num-env-runners", + type=int, + default=None, + help="The number of (remote) EnvRunners to use for the experiment.", + ) + parser.add_argument( + "--num-envs-per-env-runner", + type=int, + default=None, + help="The number of (vectorized) environments per EnvRunner. Note that " + "this is identical to the batch size for (inference) action computations.", + ) + parser.add_argument( + "--num-agents", + type=int, + default=0, + help="If 0 (default), will run as single-agent. If > 0, will run as " + "multi-agent with the environment simply cloned n times and each agent acting " + "independently at every single timestep. The overall reward for this " + "experiment is then the sum over all individual agents' rewards.", + ) + + # Evaluation options. + parser.add_argument( + "--evaluation-num-env-runners", + type=int, + default=0, + help="The number of evaluation (remote) EnvRunners to use for the experiment.", + ) + parser.add_argument( + "--evaluation-interval", + type=int, + default=0, + help="Every how many iterations to run one round of evaluation. " + "Use 0 (default) to disable evaluation.", + ) + parser.add_argument( + "--evaluation-duration", + type=lambda v: v if v == "auto" else int(v), + default=10, + help="The number of evaluation units to run each evaluation round. " + "Use `--evaluation-duration-unit` to count either in 'episodes' " + "or 'timesteps'. 
If 'auto', will run as many as possible during train pass (" + "`--evaluation-parallel-to-training` must be set then).", + ) + parser.add_argument( + "--evaluation-duration-unit", + type=str, + default="episodes", + choices=["episodes", "timesteps"], + help="The evaluation duration unit to count by. One of 'episodes' or " + "'timesteps'. This unit will be run `--evaluation-duration` times in each " + "evaluation round. If `--evaluation-duration=auto`, this setting does not " + "matter.", + ) + parser.add_argument( + "--evaluation-parallel-to-training", + action="store_true", + help="Whether to run evaluation parallel to training. This might help speed up " + "your overall iteration time. Be aware that when using this option, your " + "reported evaluation results are referring to one iteration before the current " + "one.", + ) + + # RLlib logging options. + parser.add_argument( + "--output", + type=str, + default=None, + help="The output directory to write trajectories to, which are collected by " + "the algo's EnvRunners.", + ) + parser.add_argument( + "--log-level", + type=str, + default=None, # None -> use default + choices=["INFO", "DEBUG", "WARN", "ERROR"], + help="The log-level to be used by the RLlib logger.", + ) + + # tune.Tuner options. 
+ parser.add_argument( + "--no-tune", + action="store_true", + help="Whether to NOT use tune.Tuner(), but rather a simple for-loop calling " + "`algo.train()` repeatedly until one of the stop criteria is met.", + ) + parser.add_argument( + "--num-samples", + type=int, + default=1, + help="How many (tune.Tuner.fit()) experiments to execute - if possible in " + "parallel.", + ) + parser.add_argument( + "--max-concurrent-trials", + type=int, + default=None, + help="How many (tune.Tuner) trials to run concurrently.", + ) + parser.add_argument( + "--verbose", + type=int, + default=2, + help="The verbosity level for the `tune.Tuner()` running the experiment.", + ) + parser.add_argument( + "--checkpoint-freq", + type=int, + default=0, + help=( + "The frequency (in training iterations) with which to create checkpoints. " + "Note that if --wandb-key is provided, all checkpoints will " + "automatically be uploaded to WandB." + ), + ) + parser.add_argument( + "--checkpoint-at-end", + action="store_true", + help=( + "Whether to create a checkpoint at the very end of the experiment. " + "Note that if --wandb-key is provided, all checkpoints will " + "automatically be uploaded to WandB." + ), + ) + + # WandB logging options. + parser.add_argument( + "--wandb-key", + type=str, + default=None, + help="The WandB API key to use for uploading results.", + ) + parser.add_argument( + "--wandb-project", + type=str, + default=None, + help="The WandB project name to use.", + ) + parser.add_argument( + "--wandb-run-name", + type=str, + default=None, + help="The WandB run name to use.", + ) + + # Experiment stopping and testing criteria. 
+ parser.add_argument( + "--stop-reward", + type=float, + default=default_reward, + help="Reward at which the script should stop training.", + ) + parser.add_argument( + "--stop-iters", + type=int, + default=default_iters, + help="The number of iterations to train.", + ) + parser.add_argument( + "--stop-timesteps", + type=int, + default=default_timesteps, + help="The number of (environment sampling) timesteps to train.", + ) + parser.add_argument( + "--as-test", + action="store_true", + help="Whether this script should be run as a test. If set, --stop-reward must " + "be achieved within --stop-timesteps AND --stop-iters, otherwise this " + "script will throw an exception at the end.", + ) + parser.add_argument( + "--as-release-test", + action="store_true", + help="Whether this script should be run as a release test. If set, " + "all that applies to the --as-test option is true, plus, a short JSON summary " + "will be written into a results file whose location is given by the ENV " + "variable `TEST_OUTPUT_JSON`.", + ) + + # Learner scaling options. + parser.add_argument( + "--num-learners", + type=int, + default=None, + help="The number of Learners to use. If `None`, use the algorithm's default " + "value.", + ) + parser.add_argument( + "--num-gpus-per-learner", + type=float, + default=None, + help="The number of GPUs per Learner to use. If `None` and there are enough " + "GPUs for all required Learners (--num-learners), use a value of 1, " + "otherwise 0.", + ) + parser.add_argument( + "--num-aggregator-actors-per-learner", + type=int, + default=None, + help="The number of Aggregator actors to use per Learner. If `None`, use the " + "algorithm's default value.", + ) + + # Ray init options. + parser.add_argument("--num-cpus", type=int, default=0) + parser.add_argument( + "--local-mode", + action="store_true", + help="Init Ray in local mode for easier debugging.", + ) + + # Old API stack: config.num_gpus. 
+ parser.add_argument( + "--num-gpus", + type=int, + default=None, + help="The number of GPUs to use (only on the old API stack).", + ) + + return parser + + +def check(x, y, decimals=5, atol=None, rtol=None, false=False): + """ + Checks two structures (dict, tuple, list, + np.array, float, int, etc..) for (almost) numeric identity. + All numbers in the two structures have to match up to `decimal` digits + after the floating point. Uses assertions. + + Args: + x: The value to be compared (to the expectation: `y`). This + may be a Tensor. + y: The expected value to be compared to `x`. This must not + be a tf-Tensor, but may be a tf/torch-Tensor. + decimals: The number of digits after the floating point up to + which all numeric values have to match. + atol: Absolute tolerance of the difference between x and y + (overrides `decimals` if given). + rtol: Relative tolerance of the difference between x and y + (overrides `decimals` if given). + false: Whether to check that x and y are NOT the same. + """ + # A dict type. + if isinstance(x, dict): + assert isinstance(y, dict), "ERROR: If x is dict, y needs to be a dict as well!" + y_keys = set(x.keys()) + for key, value in x.items(): + assert key in y, f"ERROR: y does not have x's key='{key}'! y={y}" + check(value, y[key], decimals=decimals, atol=atol, rtol=rtol, false=false) + y_keys.remove(key) + assert not y_keys, "ERROR: y contains keys ({}) that are not in x! y={}".format( + list(y_keys), y + ) + # A tuple type. + elif isinstance(x, (tuple, list)): + assert isinstance( + y, (tuple, list) + ), "ERROR: If x is tuple/list, y needs to be a tuple/list as well!" + assert len(y) == len( + x + ), "ERROR: y does not have the same length as x ({} vs {})!".format( + len(y), len(x) + ) + for i, value in enumerate(x): + check(value, y[i], decimals=decimals, atol=atol, rtol=rtol, false=false) + # Boolean comparison. 
+ elif isinstance(x, (np.bool_, bool)): + if false is True: + assert bool(x) is not bool(y), f"ERROR: x ({x}) is y ({y})!" + else: + assert bool(x) is bool(y), f"ERROR: x ({x}) is not y ({y})!" + # Nones or primitives (excluding int vs float, which should be compared with + # tolerance/decimals as well). + elif ( + x is None + or y is None + or isinstance(x, str) + or (isinstance(x, int) and isinstance(y, int)) + ): + if false is True: + assert x != y, f"ERROR: x ({x}) is the same as y ({y})!" + else: + assert x == y, f"ERROR: x ({x}) is not the same as y ({y})!" + # String/byte comparisons. + elif ( + hasattr(x, "dtype") and (x.dtype == object or str(x.dtype).startswith(" raise error (not expected to be equal). + if false is True: + assert False, f"ERROR: x ({x}) is the same as y ({y})!" + + # Using atol/rtol. + else: + # Provide defaults for either one of atol/rtol. + if atol is None: + atol = 0 + if rtol is None: + rtol = 1e-7 + try: + np.testing.assert_allclose(x, y, atol=atol, rtol=rtol) + except AssertionError as e: + if false is False: + raise e + else: + if false is True: + assert False, f"ERROR: x ({x}) is the same as y ({y})!" + + +def check_compute_single_action( + algorithm, include_state=False, include_prev_action_reward=False +): + """Tests different combinations of args for algorithm.compute_single_action. + + Args: + algorithm: The Algorithm object to test. + include_state: Whether to include the initial state of the Policy's + Model in the `compute_single_action` call. + include_prev_action_reward: Whether to include the prev-action and + -reward in the `compute_single_action` call. + + Raises: + ValueError: If anything unexpected happens. + """ + # Have to import this here to avoid circular dependency. + from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, SampleBatch + + # Some Algorithms may not abide to the standard API. 
+ pid = DEFAULT_POLICY_ID + try: + # Multi-agent: Pick any learnable policy (or DEFAULT_POLICY if it's the only + # one). + pid = next(iter(algorithm.env_runner.get_policies_to_train())) + pol = algorithm.get_policy(pid) + except AttributeError: + pol = algorithm.policy + # Get the policy's model. + model = pol.model + + action_space = pol.action_space + + def _test( + what, method_to_test, obs_space, full_fetch, explore, timestep, unsquash, clip + ): + call_kwargs = {} + if what is algorithm: + call_kwargs["full_fetch"] = full_fetch + call_kwargs["policy_id"] = pid + + obs = obs_space.sample() + if isinstance(obs_space, Box): + obs = np.clip(obs, -1.0, 1.0) + state_in = None + if include_state: + state_in = model.get_initial_state() + if not state_in: + state_in = [] + i = 0 + while f"state_in_{i}" in model.view_requirements: + state_in.append( + model.view_requirements[f"state_in_{i}"].space.sample() + ) + i += 1 + action_in = action_space.sample() if include_prev_action_reward else None + reward_in = 1.0 if include_prev_action_reward else None + + if method_to_test == "input_dict": + assert what is pol + + input_dict = {SampleBatch.OBS: obs} + if include_prev_action_reward: + input_dict[SampleBatch.PREV_ACTIONS] = action_in + input_dict[SampleBatch.PREV_REWARDS] = reward_in + if state_in: + if what.config.get("enable_rl_module_and_learner", False): + input_dict["state_in"] = state_in + else: + for i, s in enumerate(state_in): + input_dict[f"state_in_{i}"] = s + input_dict_batched = SampleBatch( + tree.map_structure(lambda s: np.expand_dims(s, 0), input_dict) + ) + action = pol.compute_actions_from_input_dict( + input_dict=input_dict_batched, + explore=explore, + timestep=timestep, + **call_kwargs, + ) + # Unbatch everything to be able to compare against single + # action below. + # ARS and ES return action batches as lists. 
+ if isinstance(action[0], list): + action = (np.array(action[0]), action[1], action[2]) + action = tree.map_structure(lambda s: s[0], action) + + try: + action2 = pol.compute_single_action( + input_dict=input_dict, + explore=explore, + timestep=timestep, + **call_kwargs, + ) + # Make sure these are the same, unless we have exploration + # switched on (or noisy layers). + if not explore and not pol.config.get("noisy"): + check(action, action2) + except TypeError: + pass + else: + action = what.compute_single_action( + obs, + state_in, + prev_action=action_in, + prev_reward=reward_in, + explore=explore, + timestep=timestep, + unsquash_action=unsquash, + clip_action=clip, + **call_kwargs, + ) + + state_out = None + if state_in or full_fetch or what is pol: + action, state_out, _ = action + if state_out: + for si, so in zip(tree.flatten(state_in), tree.flatten(state_out)): + if tf.is_tensor(si): + # If si is a tensor of Dimensions, we need to convert it + # We expect this to be the case for TF RLModules who's initial + # states are Tf Tensors. + si_shape = si.shape.as_list() + else: + si_shape = list(si.shape) + check(si_shape, so.shape) + + if unsquash is None: + unsquash = what.config["normalize_actions"] + if clip is None: + clip = what.config["clip_actions"] + + # Test whether unsquash/clipping works on the Algorithm's + # compute_single_action method: Both flags should force the action + # to be within the space's bounds. + if method_to_test == "single" and what == algorithm: + if not action_space.contains(action) and ( + clip or unsquash or not isinstance(action_space, Box) + ): + raise ValueError( + f"Returned action ({action}) of algorithm/policy {what} " + f"not in Env's action_space {action_space}" + ) + # We are operating in normalized space: Expect only smaller action + # values. 
+ if ( + isinstance(action_space, Box) + and not unsquash + and what.config.get("normalize_actions") + and np.any(np.abs(action) > 15.0) + ): + raise ValueError( + f"Returned action ({action}) of algorithm/policy {what} " + "should be in normalized space, but seems too large/small " + "for that!" + ) + + # Loop through: Policy vs Algorithm; Different API methods to calculate + # actions; unsquash option; clip option; full fetch or not. + for what in [pol, algorithm]: + if what is algorithm: + # Get the obs-space from Workers.env (not Policy) due to possible + # pre-processor up front. + worker_set = getattr(algorithm, "env_runner_group", None) + assert worker_set + if not worker_set.local_env_runner: + obs_space = algorithm.get_policy(pid).observation_space + else: + obs_space = worker_set.local_env_runner.for_policy( + lambda p: p.observation_space, policy_id=pid + ) + obs_space = getattr(obs_space, "original_space", obs_space) + else: + obs_space = pol.observation_space + + for method_to_test in ["single"] + (["input_dict"] if what is pol else []): + for explore in [True, False]: + for full_fetch in [False, True] if what is algorithm else [False]: + timestep = random.randint(0, 100000) + for unsquash in [True, False, None]: + for clip in [False] if unsquash else [True, False, None]: + print("-" * 80) + print(f"what={what}") + print(f"method_to_test={method_to_test}") + print(f"explore={explore}") + print(f"full_fetch={full_fetch}") + print(f"unsquash={unsquash}") + print(f"clip={clip}") + _test( + what, + method_to_test, + obs_space, + full_fetch, + explore, + timestep, + unsquash, + clip, + ) + + +def check_inference_w_connectors(policy, env_name, max_steps: int = 100): + """Checks whether the given policy can infer actions from an env with connectors. + + Args: + policy: The policy to check. + env_name: Name of the environment to check + max_steps: The maximum number of steps to run the environment for. 
+ + Raises: + ValueError: If the policy cannot infer actions from the environment. + """ + # Avoids circular import + from ray.rllib.utils.policy import local_policy_inference + + env = gym.make(env_name) + + # Potentially wrap the env like we do in RolloutWorker + if is_atari(env): + env = wrap_deepmind( + env, + dim=policy.config["model"]["dim"], + framestack=policy.config["model"].get("framestack"), + ) + + obs, info = env.reset() + reward, terminated, truncated = 0.0, False, False + ts = 0 + while not terminated and not truncated and ts < max_steps: + action_out = local_policy_inference( + policy, + env_id=0, + agent_id=0, + obs=obs, + reward=reward, + terminated=terminated, + truncated=truncated, + info=info, + ) + obs, reward, terminated, truncated, info = env.step(action_out[0][0]) + + ts += 1 + + +def check_learning_achieved( + tune_results: "tune.ResultGrid", + min_value: float, + evaluation: Optional[bool] = None, + metric: str = f"{ENV_RUNNER_RESULTS}/episode_return_mean", +): + """Throws an error if `min_reward` is not reached within tune_results. + + Checks the last iteration found in tune_results for its + "episode_return_mean" value and compares it to `min_reward`. + + Args: + tune_results: The tune.Tuner().fit() returned results object. + min_reward: The min reward that must be reached. + evaluation: If True, use `evaluation/env_runners/[metric]`, if False, use + `env_runners/[metric]`, if None, use evaluation sampler results if + available otherwise, use train sampler results. + + Raises: + ValueError: If `min_reward` not reached. + """ + # Get maximum value of `metrics` over all trials + # (check if at least one trial achieved some learning, not just the final one). 
+ recorded_values = [] + for _, row in tune_results.get_dataframe().iterrows(): + if evaluation or ( + evaluation is None and f"{EVALUATION_RESULTS}/{metric}" in row + ): + recorded_values.append(row[f"{EVALUATION_RESULTS}/{metric}"]) + else: + recorded_values.append(row[metric]) + best_value = max(recorded_values) + if best_value < min_value: + raise ValueError(f"`{metric}` of {min_value} not reached!") + print(f"`{metric}` of {min_value} reached! ok") + + +def check_off_policyness( + results: ResultDict, + upper_limit: float, + lower_limit: float = 0.0, +) -> Optional[float]: + """Verifies that the off-policy'ness of some update is within some range. + + Off-policy'ness is defined as the average (across n workers) diff + between the number of gradient updates performed on the policy used + for sampling vs the number of gradient updates that have been performed + on the trained policy (usually the one on the local worker). + + Uses the published DIFF_NUM_GRAD_UPDATES_VS_SAMPLER_POLICY metric inside + a training results dict and compares to the given bounds. + + Note: Only works with single-agent results thus far. + + Args: + results: The training results dict. + upper_limit: The upper limit to for the off_policy_ness value. + lower_limit: The lower limit to for the off_policy_ness value. + + Returns: + The off-policy'ness value (described above). + + Raises: + AssertionError: If the value is out of bounds. + """ + + # Have to import this here to avoid circular dependency. + from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID + from ray.rllib.utils.metrics.learner_info import LEARNER_INFO + + # Assert that the off-policy'ness is within the given bounds. + learner_info = results["info"][LEARNER_INFO] + if DEFAULT_POLICY_ID not in learner_info: + return None + off_policy_ness = learner_info[DEFAULT_POLICY_ID][ + DIFF_NUM_GRAD_UPDATES_VS_SAMPLER_POLICY + ] + # Roughly: Reaches up to 0.4 for 2 rollout workers and up to 0.2 for + # 1 rollout worker. 
+ if not (lower_limit <= off_policy_ness <= upper_limit): + raise AssertionError( + f"`off_policy_ness` ({off_policy_ness}) is outside the given bounds " + f"({lower_limit} - {upper_limit})!" + ) + + return off_policy_ness + + +def check_train_results_new_api_stack(train_results: ResultDict) -> None: + """Checks proper structure of a Algorithm.train() returned dict. + + Args: + train_results: The train results dict to check. + + Raises: + AssertionError: If `train_results` doesn't have the proper structure or + data in it. + """ + # Import these here to avoid circular dependencies. + from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + FAULT_TOLERANCE_STATS, + LEARNER_RESULTS, + TIMERS, + ) + + # Assert that some keys are where we would expect them. + for key in [ + ENV_RUNNER_RESULTS, + FAULT_TOLERANCE_STATS, + LEARNER_RESULTS, + TIMERS, + TRAINING_ITERATION, + "config", + ]: + assert ( + key in train_results + ), f"'{key}' not found in `train_results` ({train_results})!" + + # Make sure, `config` is an actual dict, not an AlgorithmConfig object. + assert isinstance( + train_results["config"], dict + ), "`config` in results not a python dict!" + + from ray.rllib.algorithms.algorithm_config import AlgorithmConfig + + is_multi_agent = ( + AlgorithmConfig() + .update_from_dict({"policies": train_results["config"]["policies"]}) + .is_multi_agent + ) + + # Check in particular the "info" dict. + learner_results = train_results[LEARNER_RESULTS] + + # Make sure we have a `DEFAULT_MODULE_ID key if we are not in a + # multi-agent setup. + if not is_multi_agent: + assert len(learner_results) == 0 or DEFAULT_MODULE_ID in learner_results, ( + f"'{DEFAULT_MODULE_ID}' not found in " + f"train_results['{LEARNER_RESULTS}']!" + ) + + for module_id, module_metrics in learner_results.items(): + # The ModuleID can be __all_modules__ in multi-agent case when the new learner + # stack is enabled. 
+ if module_id == "__all_modules__": + continue + + # On the new API stack, policy has no LEARNER_STATS_KEY under it anymore. + for key, value in module_metrics.items(): + # Min- and max-stats should be single values. + if key.endswith("_min") or key.endswith("_max"): + assert np.isscalar(value), f"'key' value not a scalar ({value})!" + + return train_results + + +@OldAPIStack +def check_train_results(train_results: ResultDict): + """Checks proper structure of a Algorithm.train() returned dict. + + Args: + train_results: The train results dict to check. + + Raises: + AssertionError: If `train_results` doesn't have the proper structure or + data in it. + """ + # Import these here to avoid circular dependencies. + from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID + from ray.rllib.utils.metrics.learner_info import LEARNER_INFO, LEARNER_STATS_KEY + + # Assert that some keys are where we would expect them. + for key in [ + "config", + "custom_metrics", + ENV_RUNNER_RESULTS, + "info", + "iterations_since_restore", + "num_healthy_workers", + "perf", + "time_since_restore", + "time_this_iter_s", + "timers", + "time_total_s", + TRAINING_ITERATION, + ]: + assert ( + key in train_results + ), f"'{key}' not found in `train_results` ({train_results})!" + + for key in [ + "episode_len_mean", + "episode_reward_max", + "episode_reward_mean", + "episode_reward_min", + "hist_stats", + "policy_reward_max", + "policy_reward_mean", + "policy_reward_min", + "sampler_perf", + ]: + assert key in train_results[ENV_RUNNER_RESULTS], ( + f"'{key}' not found in `train_results[ENV_RUNNER_RESULTS]` " + f"({train_results[ENV_RUNNER_RESULTS]})!" + ) + + # Make sure, `config` is an actual dict, not an AlgorithmConfig object. + assert isinstance( + train_results["config"], dict + ), "`config` in results not a python dict!" 
+ + from ray.rllib.algorithms.algorithm_config import AlgorithmConfig + + is_multi_agent = ( + AlgorithmConfig() + .update_from_dict({"policies": train_results["config"]["policies"]}) + .is_multi_agent + ) + + # Check in particular the "info" dict. + info = train_results["info"] + assert LEARNER_INFO in info, f"'learner' not in train_results['infos'] ({info})!" + assert ( + "num_steps_trained" in info or NUM_ENV_STEPS_TRAINED in info + ), f"'num_(env_)?steps_trained' not in train_results['infos'] ({info})!" + + learner_info = info[LEARNER_INFO] + + # Make sure we have a default_policy key if we are not in a + # multi-agent setup. + if not is_multi_agent: + # APEX algos sometimes have an empty learner info dict (no metrics + # collected yet). + assert len(learner_info) == 0 or DEFAULT_POLICY_ID in learner_info, ( + f"'{DEFAULT_POLICY_ID}' not found in " + f"train_results['infos']['learner'] ({learner_info})!" + ) + + for pid, policy_stats in learner_info.items(): + if pid == "batch_count": + continue + + # the pid can be __all__ in multi-agent case when the new learner stack is + # enabled. + if pid == "__all__": + continue + + # On the new API stack, policy has no LEARNER_STATS_KEY under it anymore. + if LEARNER_STATS_KEY in policy_stats: + learner_stats = policy_stats[LEARNER_STATS_KEY] + else: + learner_stats = policy_stats + for key, value in learner_stats.items(): + # Min- and max-stats should be single values. + if key.startswith("min_") or key.startswith("max_"): + assert np.isscalar(value), f"'key' value not a scalar ({value})!" 
+ + return train_results + + +# TODO (sven): Make this the de-facto, well documented, and unified utility for most of +# our tests: +# - CI (label: "learning_tests") +# - release tests (benchmarks) +# - example scripts +def run_rllib_example_script_experiment( + base_config: "AlgorithmConfig", + args: Optional[argparse.Namespace] = None, + *, + stop: Optional[Dict] = None, + success_metric: Optional[Dict] = None, + trainable: Optional[Type] = None, + tune_callbacks: Optional[List] = None, + keep_config: bool = False, + keep_ray_up: bool = False, + scheduler=None, + progress_reporter=None, +) -> Union[ResultDict, tune.result_grid.ResultGrid]: + """Given an algorithm config and some command line args, runs an experiment. + + There are some constraints on what properties must be defined in `args`. + It should ideally be generated via calling + `args = add_rllib_example_script_args()`, which can be found in this very module + here. + + The function sets up an Algorithm object from the given config (altered by the + contents of `args`), then runs the Algorithm via Tune (or manually, if + `args.no_tune` is set to True) using the stopping criteria in `stop`. + + At the end of the experiment, if `args.as_test` is True, checks, whether the + Algorithm reached the `success_metric` (if None, use `env_runners/ + episode_return_mean` with a minimum value of `args.stop_reward`). + + See https://github.com/ray-project/ray/tree/master/rllib/examples for an overview + of all supported command line options. + + Args: + base_config: The AlgorithmConfig object to use for this experiment. This base + config will be automatically "extended" based on some of the provided + `args`. For example, `args.num_env_runners` is used to set + `config.num_env_runners`, etc.. + args: A argparse.Namespace object, ideally returned by calling + `args = add_rllib_example_script_args()`. 
It must have the following + properties defined: `stop_iters`, `stop_reward`, `stop_timesteps`, + `no_tune`, `verbose`, `checkpoint_freq`, `as_test`. Optionally, for WandB + logging: `wandb_key`, `wandb_project`, `wandb_run_name`. + stop: An optional dict mapping ResultDict key strings (using "/" in case of + nesting, e.g. "env_runners/episode_return_mean" for referring to + `result_dict['env_runners']['episode_return_mean']` to minimum + values, reaching of which will stop the experiment). Default is: + { + "env_runners/episode_return_mean": args.stop_reward, + "training_iteration": args.stop_iters, + "num_env_steps_sampled_lifetime": args.stop_timesteps, + } + success_metric: Only relevant if `args.as_test` is True. + A dict mapping a single(!) ResultDict key string (using "/" in + case of nesting, e.g. "env_runners/episode_return_mean" for referring + to `result_dict['env_runners']['episode_return_mean']` to a single(!) + minimum value to be reached in order for the experiment to count as + successful. If `args.as_test` is True AND this `success_metric` is not + reached with the bounds defined by `stop`, will raise an Exception. + trainable: The Trainable sub-class to run in the tune.Tuner. If None (default), + use the registered RLlib Algorithm class specified by args.algo. + tune_callbacks: A list of Tune callbacks to configure with the tune.Tuner. + In case `args.wandb_key` is provided, appends a WandB logger to this + list. + keep_config: Set this to True, if you don't want this utility to change the + given `base_config` in any way and leave it as-is. This is helpful + for those example scripts which demonstrate how to set config settings + that are otherwise taken care of automatically in this function (e.g. + `num_env_runners`). + + Returns: + The last ResultDict from a --no-tune run OR the tune.Tuner.fit() + results. 
+ """ + if args is None: + parser = add_rllib_example_script_args() + args = parser.parse_args() + + # If run --as-release-test, --as-test must also be set. + if args.as_release_test: + args.as_test = True + + # Initialize Ray. + ray.init( + num_cpus=args.num_cpus or None, + local_mode=args.local_mode, + ignore_reinit_error=True, + ) + + # Define one or more stopping criteria. + if stop is None: + stop = { + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, + f"{ENV_RUNNER_RESULTS}/{NUM_ENV_STEPS_SAMPLED_LIFETIME}": ( + args.stop_timesteps + ), + TRAINING_ITERATION: args.stop_iters, + } + + config = base_config + + # Enhance the `base_config`, based on provided `args`. + if not keep_config: + # Set the framework. + config.framework(args.framework) + + # Add an env specifier (only if not already set in config)? + if args.env is not None and config.env is None: + config.environment(args.env) + + # Disable the new API stack? + if not args.enable_new_api_stack: + config.api_stack( + enable_rl_module_and_learner=False, + enable_env_runner_and_connector_v2=False, + ) + + # Define EnvRunner scaling and behavior. + if args.num_env_runners is not None: + config.env_runners(num_env_runners=args.num_env_runners) + if args.num_envs_per_env_runner is not None: + config.env_runners(num_envs_per_env_runner=args.num_envs_per_env_runner) + + # Define compute resources used automatically (only using the --num-learners + # and --num-gpus-per-learner args). + # New stack. + if config.enable_rl_module_and_learner: + if args.num_gpus is not None and args.num_gpus > 0: + raise ValueError( + "--num-gpus is not supported on the new API stack! To train on " + "GPUs, use the command line options `--num-gpus-per-learner=1` and " + "`--num-learners=[your number of available GPUs]`, instead." + ) + + # Do we have GPUs available in the cluster? 
+ num_gpus_available = ray.cluster_resources().get("GPU", 0) + # Number of actual Learner instances (including the local Learner if + # `num_learners=0`). + num_actual_learners = ( + args.num_learners + if args.num_learners is not None + else config.num_learners + ) or 1 # 1: There is always a local Learner, if num_learners=0. + # How many were hard-requested by the user + # (through explicit `--num-gpus-per-learner >= 1`). + num_gpus_requested = (args.num_gpus_per_learner or 0) * num_actual_learners + # Number of GPUs needed, if `num_gpus_per_learner=None` (auto). + num_gpus_needed_if_available = ( + args.num_gpus_per_learner + if args.num_gpus_per_learner is not None + else 1 + ) * num_actual_learners + # Define compute resources used. + config.resources(num_gpus=0) # old API stack setting + if args.num_learners is not None: + config.learners(num_learners=args.num_learners) + + # User wants to use aggregator actors per Learner. + if args.num_aggregator_actors_per_learner is not None: + config.learners( + num_aggregator_actors_per_learner=( + args.num_aggregator_actors_per_learner + ) + ) + + # User wants to use GPUs if available, but doesn't hard-require them. + if args.num_gpus_per_learner is None: + if num_gpus_available >= num_gpus_needed_if_available: + config.learners(num_gpus_per_learner=1) + else: + config.learners(num_gpus_per_learner=0, num_cpus_per_learner=1) + + # User hard-requires n GPUs, but they are not available -> Error. + elif num_gpus_available < num_gpus_requested: + raise ValueError( + "You are running your script with --num-learners=" + f"{args.num_learners} and --num-gpus-per-learner=" + f"{args.num_gpus_per_learner}, but your cluster only has " + f"{num_gpus_available} GPUs! Will run " + f"with {num_gpus_available} CPU Learners instead." + ) + + # All required GPUs are available -> Use them. + else: + config.learners(num_gpus_per_learner=args.num_gpus_per_learner) + + # Old stack (override only if arg was provided by user). 
+ elif args.num_gpus is not None: + config.resources(num_gpus=args.num_gpus) + + # Evaluation setup. + if args.evaluation_interval > 0: + config.evaluation( + evaluation_num_env_runners=args.evaluation_num_env_runners, + evaluation_interval=args.evaluation_interval, + evaluation_duration=args.evaluation_duration, + evaluation_duration_unit=args.evaluation_duration_unit, + evaluation_parallel_to_training=args.evaluation_parallel_to_training, + ) + + # Set the log-level (if applicable). + if args.log_level is not None: + config.debugging(log_level=args.log_level) + + # Set the output dir (if applicable). + if args.output is not None: + config.offline_data(output=args.output) + + # Run the experiment w/o Tune (directly operate on the RLlib Algorithm object). + if args.no_tune: + assert not args.as_test and not args.as_release_test + algo = config.build() + for i in range(stop.get(TRAINING_ITERATION, args.stop_iters)): + results = algo.train() + if ENV_RUNNER_RESULTS in results: + mean_return = results[ENV_RUNNER_RESULTS].get( + EPISODE_RETURN_MEAN, np.nan + ) + print(f"iter={i} R={mean_return}", end="") + if EVALUATION_RESULTS in results: + Reval = results[EVALUATION_RESULTS][ENV_RUNNER_RESULTS][ + EPISODE_RETURN_MEAN + ] + print(f" R(eval)={Reval}", end="") + print() + for key, threshold in stop.items(): + val = results + for k in key.split("/"): + try: + val = val[k] + except KeyError: + val = None + break + if val is not None and not np.isnan(val) and val >= threshold: + print(f"Stop criterium ({key}={threshold}) fulfilled!") + if not keep_ray_up: + ray.shutdown() + return results + + if not keep_ray_up: + ray.shutdown() + return results + + # Run the experiment using Ray Tune. + + # Log results using WandB. 
+ tune_callbacks = tune_callbacks or [] + if hasattr(args, "wandb_key") and ( + args.wandb_key is not None or WANDB_ENV_VAR in os.environ + ): + wandb_key = args.wandb_key or os.environ[WANDB_ENV_VAR] + project = args.wandb_project or ( + args.algo.lower() + "-" + re.sub("\\W+", "-", str(config.env).lower()) + ) + tune_callbacks.append( + WandbLoggerCallback( + api_key=wandb_key, + project=project, + upload_checkpoints=True, + **({"name": args.wandb_run_name} if args.wandb_run_name else {}), + ) + ) + # Auto-configure a CLIReporter (to log the results to the console). + # Use better ProgressReporter for multi-agent cases: List individual policy rewards. + if progress_reporter is None and args.num_agents > 0: + progress_reporter = CLIReporter( + metric_columns={ + **{ + TRAINING_ITERATION: "iter", + "time_total_s": "total time (s)", + NUM_ENV_STEPS_SAMPLED_LIFETIME: "ts", + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": "combined return", + }, + **{ + ( + f"{ENV_RUNNER_RESULTS}/module_episode_returns_mean/" f"{pid}" + ): f"return {pid}" + for pid in config.policies + }, + }, + ) + + # Force Tuner to use old progress output as the new one silently ignores our custom + # `CLIReporter`. + os.environ["RAY_AIR_NEW_OUTPUT"] = "0" + + # Run the actual experiment (using Tune). + start_time = time.time() + results = tune.Tuner( + trainable or config.algo_class, + param_space=config, + run_config=train.RunConfig( + stop=stop, + verbose=args.verbose, + callbacks=tune_callbacks, + checkpoint_config=train.CheckpointConfig( + checkpoint_frequency=args.checkpoint_freq, + checkpoint_at_end=args.checkpoint_at_end, + ), + progress_reporter=progress_reporter, + ), + tune_config=tune.TuneConfig( + num_samples=args.num_samples, + max_concurrent_trials=args.max_concurrent_trials, + scheduler=scheduler, + ), + ).fit() + time_taken = time.time() - start_time + + if not keep_ray_up: + ray.shutdown() + + # Error out, if Tuner.fit() failed to run. 
Otherwise, erroneous examples might pass + # the CI tests w/o us knowing that they are broken (b/c some examples do not have + # a --as-test flag and/or any passing criteris). + if results.errors: + raise RuntimeError( + "Running the example script resulted in one or more errors! " + f"{[e.args[0].args[2] for e in results.errors]}" + ) + + # If run as a test, check whether we reached the specified success criteria. + test_passed = False + if args.as_test: + # Success metric not provided, try extracting it from `stop`. + if success_metric is None: + for try_it in [ + f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}", + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}", + ]: + if try_it in stop: + success_metric = {try_it: stop[try_it]} + break + if success_metric is None: + success_metric = { + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, + } + # TODO (sven): Make this work for more than one metric (AND-logic?). + # Get maximum value of `metric` over all trials + # (check if at least one trial achieved some learning, not just the final one). + success_metric_key, success_metric_value = next(iter(success_metric.items())) + best_value = max( + row[success_metric_key] for _, row in results.get_dataframe().iterrows() + ) + if best_value >= success_metric_value: + test_passed = True + print(f"`{success_metric_key}` of {success_metric_value} reached! ok") + + if args.as_release_test: + trial = results._experiment_analysis.trials[0] + stats = trial.last_result + stats.pop("config", None) + json_summary = { + "time_taken": float(time_taken), + "trial_states": [trial.status], + "last_update": float(time.time()), + "stats": stats, + "passed": [test_passed], + "not_passed": [not test_passed], + "failures": {str(trial): 1} if not test_passed else {}, + } + with open( + os.environ.get("TEST_OUTPUT_JSON", "/tmp/learning_test.json"), + "wt", + ) as f: + try: + json.dump(json_summary, f) + # Something went wrong writing json. 
Try again w/ simplified stats. + except Exception: + from ray.rllib.algorithms.algorithm import Algorithm + + simplified_stats = { + k: stats[k] for k in Algorithm._progress_metrics if k in stats + } + json_summary["stats"] = simplified_stats + json.dump(json_summary, f) + + if not test_passed: + raise ValueError( + f"`{success_metric_key}` of {success_metric_value} not reached!" + ) + + return results + + +def check_same_batch(batch1, batch2) -> None: + """Check if both batches are (almost) identical. + + For MultiAgentBatches, the step count and individual policy's + SampleBatches are checked for identity. For SampleBatches, identity is + checked as the almost numerical key-value-pair identity between batches + with ray.rllib.utils.test_utils.check(). unroll_id is compared only if + both batches have an unroll_id. + + Args: + batch1: Batch to compare against batch2 + batch2: Batch to compare against batch1 + """ + # Avoids circular import + from ray.rllib.policy.sample_batch import MultiAgentBatch, SampleBatch + + assert type(batch1) is type( + batch2 + ), "Input batches are of different types {} and {}".format( + str(type(batch1)), str(type(batch2)) + ) + + def check_sample_batches(_batch1, _batch2, _policy_id=None): + unroll_id_1 = _batch1.get("unroll_id", None) + unroll_id_2 = _batch2.get("unroll_id", None) + # unroll IDs only have to fit if both batches have them + if unroll_id_1 is not None and unroll_id_2 is not None: + assert unroll_id_1 == unroll_id_2 + + batch1_keys = set() + for k, v in _batch1.items(): + # unroll_id is compared above already + if k == "unroll_id": + continue + check(v, _batch2[k]) + batch1_keys.add(k) + + batch2_keys = set(_batch2.keys()) + # unroll_id is compared above already + batch2_keys.discard("unroll_id") + _difference = batch1_keys.symmetric_difference(batch2_keys) + + # Cases where one batch has info and the other has not + if _policy_id: + assert not _difference, ( + "SampleBatches for policy with ID {} " + "don't share 
information on the " + "following information: \n{}" + "".format(_policy_id, _difference) + ) + else: + assert not _difference, ( + "SampleBatches don't share information " + "on the following information: \n{}" + "".format(_difference) + ) + + if type(batch1) is SampleBatch: + check_sample_batches(batch1, batch2) + elif type(batch1) is MultiAgentBatch: + assert batch1.count == batch2.count + batch1_ids = set() + for policy_id, policy_batch in batch1.policy_batches.items(): + check_sample_batches( + policy_batch, batch2.policy_batches[policy_id], policy_id + ) + batch1_ids.add(policy_id) + + # Case where one ma batch has info on a policy the other has not + batch2_ids = set(batch2.policy_batches.keys()) + difference = batch1_ids.symmetric_difference(batch2_ids) + assert ( + not difference + ), f"MultiAgentBatches don't share the following information: \n{difference}." + else: + raise ValueError("Unsupported batch type " + str(type(batch1))) + + +def check_reproducibilty( + algo_class: Type["Algorithm"], + algo_config: "AlgorithmConfig", + *, + fw_kwargs: Dict[str, Any], + training_iteration: int = 1, +) -> None: + # TODO @kourosh: we can get rid of examples/deterministic_training.py once + # this is added to all algorithms + """Check if the algorithm is reproducible across different testing conditions: + + frameworks: all input frameworks + num_gpus: int(os.environ.get("RLLIB_NUM_GPUS", "0")) + num_workers: 0 (only local workers) or + 4 ((1) local workers + (4) remote workers) + num_envs_per_env_runner: 2 + + Args: + algo_class: Algorithm class to test. + algo_config: Base config to use for the algorithm. + fw_kwargs: Framework iterator keyword arguments. + training_iteration: Number of training iterations to run. + + Returns: + None + + Raises: + It raises an AssertionError if the algorithm is not reproducible. 
+ """ + from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID + from ray.rllib.utils.metrics.learner_info import LEARNER_INFO + + stop_dict = {TRAINING_ITERATION: training_iteration} + # use 0 and 2 workers (for more that 4 workers we have to make sure the instance + # type in ci build has enough resources) + for num_workers in [0, 2]: + algo_config = ( + algo_config.debugging(seed=42).env_runners( + num_env_runners=num_workers, num_envs_per_env_runner=2 + ) + # new API + .learners( + num_gpus_per_learner=int(os.environ.get("RLLIB_NUM_GPUS", "0")), + ) + # old API + .resources( + num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0")), + ) + ) + + print( + f"Testing reproducibility of {algo_class.__name__}" + f" with {num_workers} workers" + ) + print("/// config") + pprint.pprint(algo_config.to_dict()) + # test tune.Tuner().fit() reproducibility + results1 = tune.Tuner( + algo_class, + param_space=algo_config.to_dict(), + run_config=train.RunConfig(stop=stop_dict, verbose=1), + ).fit() + results1 = results1.get_best_result().metrics + + results2 = tune.Tuner( + algo_class, + param_space=algo_config.to_dict(), + run_config=train.RunConfig(stop=stop_dict, verbose=1), + ).fit() + results2 = results2.get_best_result().metrics + + # Test rollout behavior. + check( + results1[ENV_RUNNER_RESULTS]["hist_stats"], + results2[ENV_RUNNER_RESULTS]["hist_stats"], + ) + # As well as training behavior (minibatch sequence during SGD + # iterations). + # As well as training behavior (minibatch sequence during SGD + # iterations). 
+ if algo_config.enable_rl_module_and_learner: + check( + results1["info"][LEARNER_INFO][DEFAULT_POLICY_ID], + results2["info"][LEARNER_INFO][DEFAULT_POLICY_ID], + ) + else: + check( + results1["info"][LEARNER_INFO][DEFAULT_POLICY_ID]["learner_stats"], + results2["info"][LEARNER_INFO][DEFAULT_POLICY_ID]["learner_stats"], + ) + + +def get_cartpole_dataset_reader(batch_size: int = 1) -> "DatasetReader": + """Returns a DatasetReader for the cartpole dataset. + Args: + batch_size: The batch size to use for the reader. + Returns: + A rllib DatasetReader for the cartpole dataset. + """ + from ray.rllib.algorithms import AlgorithmConfig + from ray.rllib.offline import IOContext + from ray.rllib.offline.dataset_reader import ( + DatasetReader, + get_dataset_and_shards, + ) + + path = "tests/data/cartpole/large.json" + input_config = {"format": "json", "paths": path} + dataset, _ = get_dataset_and_shards( + AlgorithmConfig().offline_data(input_="dataset", input_config=input_config) + ) + ioctx = IOContext( + config=( + AlgorithmConfig() + .training(train_batch_size=batch_size) + .offline_data(actions_in_input_normalized=True) + ), + worker_index=0, + ) + reader = DatasetReader(dataset, ioctx) + return reader + + +class ModelChecker: + """Helper class to compare architecturally identical Models across frameworks. + + Holds a ModelConfig, such that individual models can be added simply via their + framework string (by building them with config.build(framework=...). + A call to `check()` forces all added models to be compared in terms of their + number of trainable and non-trainable parameters, as well as, their + computation results given a common weights structure and values and identical + inputs to the models. + """ + + def __init__(self, config): + self.config = config + + # To compare number of params between frameworks. + self.param_counts = {} + # To compare computed outputs from fixed-weights-nets between frameworks. 
+ self.output_values = {} + + # We will pass an observation filled with this one random value through + # all DL networks (after they have been set to fixed-weights) to compare + # the computed outputs. + self.random_fill_input_value = np.random.uniform(-0.01, 0.01) + + # Dict of models to check against each other. + self.models = {} + + def add(self, framework: str = "torch", obs=True, state=False) -> Any: + """Builds a new Model for the given framework.""" + model = self.models[framework] = self.config.build(framework=framework) + + # Pass a B=1 observation through the model. + inputs = np.full( + [1] + ([1] if state else []) + list(self.config.input_dims), + self.random_fill_input_value, + ) + if obs: + inputs = {Columns.OBS: inputs} + if state: + inputs[Columns.STATE_IN] = tree.map_structure( + lambda s: np.zeros(shape=[1] + list(s)), state + ) + if framework == "torch": + from ray.rllib.utils.torch_utils import convert_to_torch_tensor + + inputs = convert_to_torch_tensor(inputs) + # w/ old specs: inputs = model.input_specs.fill(self.random_fill_input_value) + + outputs = model(inputs) + + # Bring model into a reproducible, comparable state (so we can compare + # computations across frameworks). Use only a value-sequence of len=1 here + # as it could possibly be that the layers are stored in different order + # across the different frameworks. + model._set_to_dummy_weights(value_sequence=(self.random_fill_input_value,)) + + # Perform another forward pass. + comparable_outputs = model(inputs) + + # Store the number of parameters for this framework's net. + self.param_counts[framework] = model.get_num_parameters() + # Store the fixed-weights-net outputs for this framework's net. 
+ if framework == "torch": + self.output_values[framework] = tree.map_structure( + lambda s: s.detach().numpy() if s is not None else None, + comparable_outputs, + ) + else: + self.output_values[framework] = tree.map_structure( + lambda s: s.numpy() if s is not None else None, comparable_outputs + ) + return outputs + + def check(self): + """Compares all added Models with each other and possibly raises errors.""" + + main_key = next(iter(self.models.keys())) + # Compare number of trainable and non-trainable params between all + # frameworks. + for c in self.param_counts.values(): + check(c, self.param_counts[main_key]) + + # Compare dummy outputs by exact values given that all nets received the + # same input and all nets have the same (dummy) weight values. + for v in self.output_values.values(): + check(v, self.output_values[main_key], atol=0.0005) + + +def _get_mean_action_from_algorithm(alg: "Algorithm", obs: np.ndarray) -> np.ndarray: + """Returns the mean action computed by the given algorithm. + + Note: This makes calls to `Algorithm.compute_single_action` + + Args: + alg: The constructed algorithm to run inference on. + obs: The observation to compute the action for. + + Returns: + The mean action computed by the algorithm over 5000 samples. + + """ + out = [] + for _ in range(5000): + out.append(float(alg.compute_single_action(obs))) + return np.mean(out) + + +def check_supported_spaces( + alg: str, + config: "AlgorithmConfig", + train: bool = True, + check_bounds: bool = False, + frameworks: Optional[Tuple[str]] = None, + use_gpu: bool = False, +): + """Checks whether the given algorithm supports different action and obs spaces. + + Performs the checks by constructing an rllib algorithm from the config and + checking to see that the model inside the policy is the correct one given + the action and obs spaces. 
For example if the action space is discrete and + the obs space is an image, then the model should be a vision network with + a categorical action distribution. + + Args: + alg: The name of the algorithm to test. + config: The config to use for the algorithm. + train: Whether to train the algorithm for a few iterations. + check_bounds: Whether to check the bounds of the action space. + frameworks: The frameworks to test the algorithm with. + use_gpu: Whether to check support for training on a gpu. + + + """ + # Do these imports here because otherwise we have circular imports. + from ray.rllib.examples.envs.classes.random_env import RandomEnv + from ray.rllib.models.torch.complex_input_net import ( + ComplexInputNetwork as TorchComplexNet, + ) + from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFCNet + from ray.rllib.models.torch.visionnet import VisionNetwork as TorchVisionNet + + action_spaces_to_test = { + # Test discrete twice here until we support multi_binary action spaces + "discrete": Discrete(5), + "continuous": Box(-1.0, 1.0, (5,), dtype=np.float32), + "int_actions": Box(0, 3, (2, 3), dtype=np.int32), + "multidiscrete": MultiDiscrete([1, 2, 3, 4]), + "tuple": GymTuple( + [Discrete(2), Discrete(3), Box(-1.0, 1.0, (5,), dtype=np.float32)] + ), + "dict": GymDict( + { + "action_choice": Discrete(3), + "parameters": Box(-1.0, 1.0, (1,), dtype=np.float32), + "yet_another_nested_dict": GymDict( + {"a": GymTuple([Discrete(2), Discrete(3)])} + ), + } + ), + } + + observation_spaces_to_test = { + "multi_binary": MultiBinary([3, 10, 10]), + "discrete": Discrete(5), + "continuous": Box(-1.0, 1.0, (5,), dtype=np.float32), + "vector2d": Box(-1.0, 1.0, (5, 5), dtype=np.float32), + "image": Box(-1.0, 1.0, (84, 84, 1), dtype=np.float32), + "tuple": GymTuple([Discrete(10), Box(-1.0, 1.0, (5,), dtype=np.float32)]), + "dict": GymDict( + { + "task": Discrete(10), + "position": Box(-1.0, 1.0, (5,), dtype=np.float32), + } + ), + } + + # The observation 
spaces that we test RLModules with + rlmodule_supported_observation_spaces = [ + "multi_binary", + "discrete", + "continuous", + "image", + "tuple", + "dict", + ] + + # The action spaces that we test RLModules with + rlmodule_supported_action_spaces = ["discrete", "continuous"] + + default_observation_space = default_action_space = "discrete" + + config["log_level"] = "ERROR" + config["env"] = RandomEnv + + def _do_check(alg, config, a_name, o_name): + # We need to copy here so that this validation does not affect the actual + # validation method call further down the line. + config_copy = config.copy() + config_copy.validate() + # If RLModules are enabled, we need to skip a few tests for now: + if config_copy.enable_rl_module_and_learner: + # Skip PPO cases in which RLModules don't support the given spaces yet. + if o_name not in rlmodule_supported_observation_spaces: + logger.warning( + "Skipping PPO test with RLModules for obs space {}".format(o_name) + ) + return + if a_name not in rlmodule_supported_action_spaces: + logger.warning( + "Skipping PPO test with RLModules for action space {}".format( + a_name + ) + ) + return + + fw = config["framework"] + action_space = action_spaces_to_test[a_name] + obs_space = observation_spaces_to_test[o_name] + print( + "=== Testing {} (fw={}) action_space={} obs_space={} ===".format( + alg, fw, action_space, obs_space + ) + ) + t0 = time.time() + config.update_from_dict( + dict( + env_config=dict( + action_space=action_space, + observation_space=obs_space, + reward_space=Box(1.0, 1.0, shape=(), dtype=np.float32), + p_terminated=1.0, + check_action_bounds=check_bounds, + ) + ) + ) + stat = "ok" + + try: + algo = config.build() + except ray.exceptions.RayActorError as e: + if len(e.args) >= 2 and isinstance(e.args[2], UnsupportedSpaceException): + stat = "unsupported" + elif isinstance(e.args[0].args[2], UnsupportedSpaceException): + stat = "unsupported" + else: + raise + except UnsupportedSpaceException: + stat = 
"unsupported" + else: + if alg not in ["SAC", "PPO"]: + # 2D (image) input: Expect VisionNet. + if o_name in ["atari", "image"]: + assert isinstance(algo.get_policy().model, TorchVisionNet) + # 1D input: Expect FCNet. + elif o_name == "continuous": + assert isinstance(algo.get_policy().model, TorchFCNet) + # Could be either one: ComplexNet (if disabled Preprocessor) + # or FCNet (w/ Preprocessor). + elif o_name == "vector2d": + assert isinstance( + algo.get_policy().model, (TorchComplexNet, TorchFCNet) + ) + if train: + algo.train() + algo.stop() + print("Test: {}, ran in {}s".format(stat, time.time() - t0)) + + if not frameworks: + frameworks = ("tf2", "tf", "torch") + + _do_check_remote = ray.remote(_do_check) + _do_check_remote = _do_check_remote.options(num_gpus=1 if use_gpu else 0) + # Test all action spaces first. + for a_name in action_spaces_to_test.keys(): + o_name = default_observation_space + ray.get(_do_check_remote.remote(alg, config, a_name, o_name)) + + # Now test all observation spaces. 
+ for o_name in observation_spaces_to_test.keys(): + a_name = default_action_space + ray.get(_do_check_remote.remote(alg, config, a_name, o_name)) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/tf_utils.py b/.venv/lib/python3.11/site-packages/ray/rllib/utils/tf_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c32ad32d268658ff40b8846ab018c876e1f986c1 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/utils/tf_utils.py @@ -0,0 +1,812 @@ +import logging +from typing import Any, Callable, List, Optional, Type, TYPE_CHECKING, Union + +import gymnasium as gym +import numpy as np +import tree # pip install dm_tree +from gymnasium.spaces import Discrete, MultiDiscrete + +from ray.rllib.utils.annotations import PublicAPI, DeveloperAPI +from ray.rllib.utils.framework import try_import_tf +from ray.rllib.utils.numpy import SMALL_NUMBER +from ray.rllib.utils.spaces.space_utils import get_base_struct_from_space +from ray.rllib.utils.typing import ( + LocalOptimizer, + ModelGradients, + NetworkType, + PartialAlgorithmConfigDict, + SpaceStruct, + TensorStructType, + TensorType, +) + +if TYPE_CHECKING: + from ray.rllib.algorithms.algorithm_config import AlgorithmConfig + from ray.rllib.core.learner.learner import ParamDict + from ray.rllib.policy.eager_tf_policy import EagerTFPolicy + from ray.rllib.policy.eager_tf_policy_v2 import EagerTFPolicyV2 + from ray.rllib.policy.tf_policy import TFPolicy + +logger = logging.getLogger(__name__) +tf1, tf, tfv = try_import_tf() + + +@PublicAPI +def clip_gradients( + gradients_dict: "ParamDict", + *, + grad_clip: Optional[float] = None, + grad_clip_by: str, +) -> Optional[float]: + """Performs gradient clipping on a grad-dict based on a clip value and clip mode. + + Changes the provided gradient dict in place. + + Args: + gradients_dict: The gradients dict, mapping str to gradient tensors. + grad_clip: The value to clip with. 
The way gradients are clipped is defined + by the `grad_clip_by` arg (see below). + grad_clip_by: One of 'value', 'norm', or 'global_norm'. + + Returns: + If `grad_clip_by`="global_norm" and `grad_clip` is not None, returns the global + norm of all tensors, otherwise returns None. + """ + # No clipping, return. + if grad_clip is None: + return + + # Clip by value (each gradient individually). + if grad_clip_by == "value": + for k, v in gradients_dict.copy().items(): + gradients_dict[k] = tf.clip_by_value(v, -grad_clip, grad_clip) + + # Clip by L2-norm (per gradient tensor). + elif grad_clip_by == "norm": + for k, v in gradients_dict.copy().items(): + gradients_dict[k] = tf.clip_by_norm(v, grad_clip) + + # Clip by global L2-norm (across all gradient tensors). + else: + assert grad_clip_by == "global_norm" + + clipped_grads, global_norm = tf.clip_by_global_norm( + list(gradients_dict.values()), grad_clip + ) + for k, v in zip(gradients_dict.copy().keys(), clipped_grads): + gradients_dict[k] = v + + # Return the computed global norm scalar. + return global_norm + + +@PublicAPI +def explained_variance(y: TensorType, pred: TensorType) -> TensorType: + """Computes the explained variance for a pair of labels and predictions. + + The formula used is: + max(-1.0, 1.0 - (std(y - pred)^2 / std(y)^2)) + + Args: + y: The labels. + pred: The predictions. + + Returns: + The explained variance given a pair of labels and predictions. + """ + _, y_var = tf.nn.moments(y, axes=[0]) + _, diff_var = tf.nn.moments(y - pred, axes=[0]) + return tf.maximum(-1.0, 1 - (diff_var / (y_var + SMALL_NUMBER))) + + +@PublicAPI +def flatten_inputs_to_1d_tensor( + inputs: TensorStructType, + spaces_struct: Optional[SpaceStruct] = None, + time_axis: bool = False, +) -> TensorType: + """Flattens arbitrary input structs according to the given spaces struct. + + Returns a single 1D tensor resulting from the different input + components' values. 
+ + Thereby: + - Boxes (any shape) get flattened to (B, [T]?, -1). Note that image boxes + are not treated differently from other types of Boxes and get + flattened as well. + - Discrete (int) values are one-hot'd, e.g. a batch of [1, 0, 3] (B=3 with + Discrete(4) space) results in [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1]]. + - MultiDiscrete values are multi-one-hot'd, e.g. a batch of + [[0, 2], [1, 4]] (B=2 with MultiDiscrete([2, 5]) space) results in + [[1, 0, 0, 0, 1, 0, 0], [0, 1, 0, 0, 0, 0, 1]]. + + Args: + inputs: The inputs to be flattened. + spaces_struct: The structure of the spaces that behind the input + time_axis: Whether all inputs have a time-axis (after the batch axis). + If True, will keep not only the batch axis (0th), but the time axis + (1st) as-is and flatten everything from the 2nd axis up. + + Returns: + A single 1D tensor resulting from concatenating all + flattened/one-hot'd input components. Depending on the time_axis flag, + the shape is (B, n) or (B, T, n). + + .. testcode:: + :skipif: True + + # B=2 + from ray.rllib.utils.tf_utils import flatten_inputs_to_1d_tensor + from gymnasium.spaces import Discrete, Box + out = flatten_inputs_to_1d_tensor( + {"a": [1, 0], "b": [[[0.0], [0.1]], [1.0], [1.1]]}, + spaces_struct=dict(a=Discrete(2), b=Box(shape=(2, 1))) + ) + print(out) + + # B=2; T=2 + out = flatten_inputs_to_1d_tensor( + ([[1, 0], [0, 1]], + [[[0.0, 0.1], [1.0, 1.1]], [[2.0, 2.1], [3.0, 3.1]]]), + spaces_struct=tuple([Discrete(2), Box(shape=(2, ))]), + time_axis=True + ) + print(out) + + .. 
testoutput:: + + [[0.0, 1.0, 0.0, 0.1], [1.0, 0.0, 1.0, 1.1]] # B=2 n=4 + [[[0.0, 1.0, 0.0, 0.1], [1.0, 0.0, 1.0, 1.1]], + [[1.0, 0.0, 2.0, 2.1], [0.0, 1.0, 3.0, 3.1]]] # B=2 T=2 n=4 + """ + + flat_inputs = tree.flatten(inputs) + flat_spaces = ( + tree.flatten(spaces_struct) + if spaces_struct is not None + else [None] * len(flat_inputs) + ) + + B = None + T = None + out = [] + for input_, space in zip(flat_inputs, flat_spaces): + input_ = tf.convert_to_tensor(input_) + shape = tf.shape(input_) + # Store batch and (if applicable) time dimension. + if B is None: + B = shape[0] + if time_axis: + T = shape[1] + + # One-hot encoding. + if isinstance(space, Discrete): + if time_axis: + input_ = tf.reshape(input_, [B * T]) + out.append(tf.cast(one_hot(input_, space), tf.float32)) + elif isinstance(space, MultiDiscrete): + if time_axis: + input_ = tf.reshape(input_, [B * T, -1]) + out.append(tf.cast(one_hot(input_, space), tf.float32)) + # Flatten. + else: + if time_axis: + input_ = tf.reshape(input_, [B * T, -1]) + else: + input_ = tf.reshape(input_, [B, -1]) + out.append(tf.cast(input_, tf.float32)) + + merged = tf.concat(out, axis=-1) + # Restore the time-dimension, if applicable. + if time_axis: + merged = tf.reshape(merged, [B, T, -1]) + + return merged + + +@PublicAPI +def get_gpu_devices() -> List[str]: + """Returns a list of GPU device names, e.g. ["/gpu:0", "/gpu:1"]. + + Supports both tf1.x and tf2.x. + + Returns: + List of GPU device names (str). + """ + if tfv == 1: + from tensorflow.python.client import device_lib + + devices = device_lib.list_local_devices() + else: + try: + devices = tf.config.list_physical_devices() + except Exception: + devices = tf.config.experimental.list_physical_devices() + + # Expect "GPU", but also stuff like: "XLA_GPU". 
+ return [d.name for d in devices if "GPU" in d.device_type] + + +@PublicAPI +def get_placeholder( + *, + space: Optional[gym.Space] = None, + value: Optional[Any] = None, + name: Optional[str] = None, + time_axis: bool = False, + flatten: bool = True, +) -> "tf1.placeholder": + """Returns a tf1.placeholder object given optional hints, such as a space. + + Note that the returned placeholder will always have a leading batch + dimension (None). + + Args: + space: An optional gym.Space to hint the shape and dtype of the + placeholder. + value: An optional value to hint the shape and dtype of the + placeholder. + name: An optional name for the placeholder. + time_axis: Whether the placeholder should also receive a time + dimension (None). + flatten: Whether to flatten the given space into a plain Box space + and then create the placeholder from the resulting space. + + Returns: + The tf1 placeholder. + """ + from ray.rllib.models.catalog import ModelCatalog + + if space is not None: + if isinstance(space, (gym.spaces.Dict, gym.spaces.Tuple)): + if flatten: + return ModelCatalog.get_action_placeholder(space, None) + else: + return tree.map_structure_with_path( + lambda path, component: get_placeholder( + space=component, + name=name + "." 
+ ".".join([str(p) for p in path]), + ), + get_base_struct_from_space(space), + ) + return tf1.placeholder( + shape=(None,) + ((None,) if time_axis else ()) + space.shape, + dtype=tf.float32 if space.dtype == np.float64 else space.dtype, + name=name, + ) + else: + assert value is not None + shape = value.shape[1:] + return tf1.placeholder( + shape=(None,) + + ((None,) if time_axis else ()) + + (shape if isinstance(shape, tuple) else tuple(shape.as_list())), + dtype=tf.float32 if value.dtype == np.float64 else value.dtype, + name=name, + ) + + +@PublicAPI +def get_tf_eager_cls_if_necessary( + orig_cls: Type["TFPolicy"], + config: Union["AlgorithmConfig", PartialAlgorithmConfigDict], +) -> Type[Union["TFPolicy", "EagerTFPolicy", "EagerTFPolicyV2"]]: + """Returns the corresponding tf-eager class for a given TFPolicy class. + + Args: + orig_cls: The original TFPolicy class to get the corresponding tf-eager + class for. + config: The Algorithm config dict or AlgorithmConfig object. + + Returns: + The tf eager policy class corresponding to the given TFPolicy class. + """ + cls = orig_cls + framework = config.get("framework", "tf") + + if framework in ["tf2", "tf"] and not tf1: + raise ImportError("Could not import tensorflow!") + + if framework == "tf2": + if not tf1.executing_eagerly(): + tf1.enable_eager_execution() + assert tf1.executing_eagerly() + + from ray.rllib.policy.tf_policy import TFPolicy + from ray.rllib.policy.eager_tf_policy import EagerTFPolicy + from ray.rllib.policy.eager_tf_policy_v2 import EagerTFPolicyV2 + + # Create eager-class (if not already one). + if hasattr(orig_cls, "as_eager") and not issubclass(orig_cls, EagerTFPolicy): + cls = orig_cls.as_eager() + # Could be some other type of policy or already + # eager-ized. + elif not issubclass(orig_cls, TFPolicy): + pass + else: + raise ValueError( + "This policy does not support eager execution: {}".format(orig_cls) + ) + + # Now that we know, policy is an eager one, add tracing, if necessary. 
+ if config.get("eager_tracing") and issubclass( + cls, (EagerTFPolicy, EagerTFPolicyV2) + ): + cls = cls.with_tracing() + return cls + + +@PublicAPI +def huber_loss(x: TensorType, delta: float = 1.0) -> TensorType: + """Computes the huber loss for a given term and delta parameter. + + Reference: https://en.wikipedia.org/wiki/Huber_loss + Note that the factor of 0.5 is implicitly included in the calculation. + + Formula: + L = 0.5 * x^2 for small abs x (delta threshold) + L = delta * (abs(x) - 0.5*delta) for larger abs x (delta threshold) + + Args: + x: The input term, e.g. a TD error. + delta: The delta parmameter in the above formula. + + Returns: + The Huber loss resulting from `x` and `delta`. + """ + return tf.where( + tf.abs(x) < delta, # for small x -> apply the Huber correction + tf.math.square(x) * 0.5, + delta * (tf.abs(x) - 0.5 * delta), + ) + + +@PublicAPI +def l2_loss(x: TensorType) -> TensorType: + """Computes half the L2 norm over a tensor's values without the sqrt. + + output = 0.5 * sum(x ** 2) + + Args: + x: The input tensor. + + Returns: + 0.5 times the L2 norm over the given tensor's values (w/o sqrt). + """ + return 0.5 * tf.reduce_sum(tf.pow(x, 2.0)) + + +@PublicAPI +def make_tf_callable( + session_or_none: Optional["tf1.Session"], dynamic_shape: bool = False +) -> Callable: + """Returns a function that can be executed in either graph or eager mode. + + The function must take only positional args. + + If eager is enabled, this will act as just a function. Otherwise, it + will build a function that executes a session run with placeholders + internally. + + Args: + session_or_none: tf.Session if in graph mode, else None. + dynamic_shape: True if the placeholders should have a dynamic + batch dimension. Otherwise they will be fixed shape. + + Returns: + A function that can be called in either eager or static-graph mode. 
+ """ + + if tf.executing_eagerly(): + assert session_or_none is None + else: + assert session_or_none is not None + + def make_wrapper(fn): + # Static-graph mode: Create placeholders and make a session call each + # time the wrapped function is called. Returns the output of this + # session call. + if session_or_none is not None: + args_placeholders = [] + kwargs_placeholders = {} + + symbolic_out = [None] + + def call(*args, **kwargs): + args_flat = [] + for a in args: + if type(a) is list: + args_flat.extend(a) + else: + args_flat.append(a) + args = args_flat + + # We have not built any placeholders yet: Do this once here, + # then reuse the same placeholders each time we call this + # function again. + if symbolic_out[0] is None: + with session_or_none.graph.as_default(): + + def _create_placeholders(path, value): + if dynamic_shape: + if len(value.shape) > 0: + shape = (None,) + value.shape[1:] + else: + shape = () + else: + shape = value.shape + return tf1.placeholder( + dtype=value.dtype, + shape=shape, + name=".".join([str(p) for p in path]), + ) + + placeholders = tree.map_structure_with_path( + _create_placeholders, args + ) + for ph in tree.flatten(placeholders): + args_placeholders.append(ph) + + placeholders = tree.map_structure_with_path( + _create_placeholders, kwargs + ) + for k, ph in placeholders.items(): + kwargs_placeholders[k] = ph + + symbolic_out[0] = fn(*args_placeholders, **kwargs_placeholders) + feed_dict = dict(zip(args_placeholders, tree.flatten(args))) + tree.map_structure( + lambda ph, v: feed_dict.__setitem__(ph, v), + kwargs_placeholders, + kwargs, + ) + ret = session_or_none.run(symbolic_out[0], feed_dict) + return ret + + return call + # Eager mode (call function as is). + else: + return fn + + return make_wrapper + + +# TODO (sven): Deprecate this function once we have moved completely to the Learner API. +# Replaced with `clip_gradients()`. 
+@PublicAPI +def minimize_and_clip( + optimizer: LocalOptimizer, + objective: TensorType, + var_list: List["tf.Variable"], + clip_val: float = 10.0, +) -> ModelGradients: + """Computes, then clips gradients using objective, optimizer and var list. + + Ensures the norm of the gradients for each variable is clipped to + `clip_val`. + + Args: + optimizer: Either a shim optimizer (tf eager) containing a + tf.GradientTape under `self.tape` or a tf1 local optimizer + object. + objective: The loss tensor to calculate gradients on. + var_list: The list of tf.Variables to compute gradients over. + clip_val: The global norm clip value. Will clip around -clip_val and + +clip_val. + + Returns: + The resulting model gradients (list or tuples of grads + vars) + corresponding to the input `var_list`. + """ + # Accidentally passing values < 0.0 will break all gradients. + assert clip_val is None or clip_val > 0.0, clip_val + + if tf.executing_eagerly(): + tape = optimizer.tape + grads_and_vars = list(zip(list(tape.gradient(objective, var_list)), var_list)) + else: + grads_and_vars = optimizer.compute_gradients(objective, var_list=var_list) + + return [ + (tf.clip_by_norm(g, clip_val) if clip_val is not None else g, v) + for (g, v) in grads_and_vars + if g is not None + ] + + +@PublicAPI +def one_hot(x: TensorType, space: gym.Space) -> TensorType: + """Returns a one-hot tensor, given and int tensor and a space. + + Handles the MultiDiscrete case as well. + + Args: + x: The input tensor. + space: The space to use for generating the one-hot tensor. + + Returns: + The resulting one-hot tensor. + + Raises: + ValueError: If the given space is not a discrete one. + + .. testcode:: + :skipif: True + + import gymnasium as gym + import tensorflow as tf + from ray.rllib.utils.tf_utils import one_hot + x = tf.Variable([0, 3], dtype=tf.int32) # batch-dim=2 + # Discrete space with 4 (one-hot) slots per batch item. + s = gym.spaces.Discrete(4) + one_hot(x, s) + + .. testoutput:: + + + + .. 
testcode:: + :skipif: True + + x = tf.Variable([[0, 1, 2, 3]], dtype=tf.int32) # batch-dim=1 + # MultiDiscrete space with 5 + 4 + 4 + 7 = 20 (one-hot) slots + # per batch item. + s = gym.spaces.MultiDiscrete([5, 4, 4, 7]) + one_hot(x, s) + + .. testoutput:: + + + """ + if isinstance(space, Discrete): + return tf.one_hot(x, space.n, dtype=tf.float32) + elif isinstance(space, MultiDiscrete): + if isinstance(space.nvec[0], np.ndarray): + nvec = np.ravel(space.nvec) + x = tf.reshape(x, (x.shape[0], -1)) + else: + nvec = space.nvec + return tf.concat( + [tf.one_hot(x[:, i], n, dtype=tf.float32) for i, n in enumerate(nvec)], + axis=-1, + ) + else: + raise ValueError("Unsupported space for `one_hot`: {}".format(space)) + + +@PublicAPI +def reduce_mean_ignore_inf(x: TensorType, axis: Optional[int] = None) -> TensorType: + """Same as tf.reduce_mean() but ignores -inf values. + + Args: + x: The input tensor to reduce mean over. + axis: The axis over which to reduce. None for all axes. + + Returns: + The mean reduced inputs, ignoring inf values. + """ + mask = tf.not_equal(x, tf.float32.min) + x_zeroed = tf.where(mask, x, tf.zeros_like(x)) + return tf.math.reduce_sum(x_zeroed, axis) / tf.math.reduce_sum( + tf.cast(mask, tf.float32), axis + ) + + +@PublicAPI +def scope_vars( + scope: Union[str, "tf1.VariableScope"], trainable_only: bool = False +) -> List["tf.Variable"]: + """Get variables inside a given scope. + + Args: + scope: Scope in which the variables reside. + trainable_only: Whether or not to return only the variables that were + marked as trainable. + + Returns: + The list of variables in the given `scope`. + """ + return tf1.get_collection( + tf1.GraphKeys.TRAINABLE_VARIABLES + if trainable_only + else tf1.GraphKeys.VARIABLES, + scope=scope if isinstance(scope, str) else scope.name, + ) + + +@PublicAPI +def symlog(x: "tf.Tensor") -> "tf.Tensor": + """The symlog function as described in [1]: + + [1] Mastering Diverse Domains through World Models - 2023 + D. 
Hafner, J. Pasukonis, J. Ba, T. Lillicrap + https://arxiv.org/pdf/2301.04104v1.pdf + """ + return tf.math.sign(x) * tf.math.log(tf.math.abs(x) + 1) + + +@PublicAPI +def inverse_symlog(y: "tf.Tensor") -> "tf.Tensor": + """Inverse of the `symlog` function as desribed in [1]: + + [1] Mastering Diverse Domains through World Models - 2023 + D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap + https://arxiv.org/pdf/2301.04104v1.pdf + """ + # To get to symlog inverse, we solve the symlog equation for x: + # y = sign(x) * log(|x| + 1) + # <=> y / sign(x) = log(|x| + 1) + # <=> y = log( x + 1) V x >= 0 + # -y = log(-x + 1) V x < 0 + # <=> exp(y) = x + 1 V x >= 0 + # exp(-y) = -x + 1 V x < 0 + # <=> exp(y) - 1 = x V x >= 0 + # exp(-y) - 1 = -x V x < 0 + # <=> exp(y) - 1 = x V x >= 0 (if x >= 0, then y must also be >= 0) + # -exp(-y) - 1 = x V x < 0 (if x < 0, then y must also be < 0) + # <=> sign(y) * (exp(|y|) - 1) = x + return tf.math.sign(y) * (tf.math.exp(tf.math.abs(y)) - 1) + + +@PublicAPI +def two_hot( + value: "tf.Tensor", + num_buckets: int = 255, + lower_bound: float = -20.0, + upper_bound: float = 20.0, + dtype=None, +): + """Returns a two-hot vector of dim=num_buckets with two entries that are non-zero. + + See [1] for more details: + [1] Mastering Diverse Domains through World Models - 2023 + D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap + https://arxiv.org/pdf/2301.04104v1.pdf + + Entries in the vector represent equally sized buckets within some fixed range + (`lower_bound` to `upper_bound`). + Those entries not 0.0 at positions k and k+1 encode the actual `value` and sum + up to 1.0. They are the weights multiplied by the buckets values at k and k+1 for + retrieving `value`. 
+ + Example: + num_buckets=11 + lower_bound=-5 + upper_bound=5 + value=2.5 + -> [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0] + -> [-5 -4 -3 -2 -1 0 1 2 3 4 5] (0.5*2 + 0.5*3=2.5) + + Example: + num_buckets=5 + lower_bound=-1 + upper_bound=1 + value=0.1 + -> [0.0, 0.0, 0.8, 0.2, 0.0] + -> [-1 -0.5 0 0.5 1] (0.2*0.5 + 0.8*0=0.1) + + Args: + value: The input tensor of shape (B,) to be two-hot encoded. + num_buckets: The number of buckets to two-hot encode into. + lower_bound: The lower bound value used for the encoding. If input values are + lower than this boundary, they will be encoded as `lower_bound`. + upper_bound: The upper bound value used for the encoding. If input values are + higher than this boundary, they will be encoded as `upper_bound`. + + Returns: + The two-hot encoded tensor of shape (B, num_buckets). + """ + # First make sure, values are clipped. + value = tf.clip_by_value(value, lower_bound, upper_bound) + # Tensor of batch indices: [0, B=batch size). + batch_indices = tf.cast( + tf.range(0, tf.shape(value)[0]), + dtype=dtype or tf.float32, + ) + # Calculate the step deltas (how much space between each bucket's central value?). + bucket_delta = (upper_bound - lower_bound) / (num_buckets - 1) + # Compute the float indices (might be non-int numbers: sitting between two buckets). + idx = (-lower_bound + value) / bucket_delta + # k + k = tf.math.floor(idx) + # k+1 + kp1 = tf.math.ceil(idx) + # In case k == kp1 (idx is exactly on the bucket boundary), move kp1 up by 1.0. + # Otherwise, this would result in a NaN in the returned two-hot tensor. + kp1 = tf.where(tf.equal(k, kp1), kp1 + 1.0, kp1) + # Iff `kp1` is one beyond our last index (because incoming value is larger than + # `upper_bound`), move it to one before k (kp1's weight is going to be 0.0 anyways, + # so it doesn't matter where it points to; we are just avoiding an index error + # with this). 
+ kp1 = tf.where(tf.equal(kp1, num_buckets), kp1 - 2.0, kp1) + # The actual values found at k and k+1 inside the set of buckets. + values_k = lower_bound + k * bucket_delta + values_kp1 = lower_bound + kp1 * bucket_delta + # Compute the two-hot weights (adding up to 1.0) to use at index k and k+1. + weights_k = (value - values_kp1) / (values_k - values_kp1) + weights_kp1 = 1.0 - weights_k + # Compile a tensor of full paths (indices from batch index to feature index) to + # use for the scatter_nd op. + indices_k = tf.stack([batch_indices, k], -1) + indices_kp1 = tf.stack([batch_indices, kp1], -1) + indices = tf.concat([indices_k, indices_kp1], 0) + # The actual values (weights adding up to 1.0) to place at the computed indices. + updates = tf.concat([weights_k, weights_kp1], 0) + # Call the actual scatter update op, returning a zero-filled tensor, only changed + # at the given indices. + return tf.scatter_nd( + tf.cast(indices, tf.int32), + updates, + shape=(tf.shape(value)[0], num_buckets), + ) + + +@PublicAPI +def update_target_network( + main_net: NetworkType, + target_net: NetworkType, + tau: float, +) -> None: + """Updates a keras.Model target network using Polyak averaging. + + new_target_net_weight = ( + tau * main_net_weight + (1.0 - tau) * current_target_net_weight + ) + + Args: + main_net: The keras.Model to update from. + target_net: The target network to update. + tau: The tau value to use in the Polyak averaging formula. + """ + for old_var, current_var in zip(target_net.variables, main_net.variables): + updated_var = tau * current_var + (1.0 - tau) * old_var + old_var.assign(updated_var) + + +@PublicAPI +def zero_logps_from_actions(actions: TensorStructType) -> TensorType: + """Helper function useful for returning dummy logp's (0) for some actions. + + Args: + actions: The input actions. This can be any struct + of complex action components or a simple tensor of different + dimensions, e.g. [B], [B, 2], or {"a": [B, 4, 5], "b": [B]}. 
+ + Returns: + A 1D tensor of 0.0 (dummy logp's) matching the batch + dim of `actions` (shape=[B]). + """ + # Need to flatten `actions` in case we have a complex action space. + # Take the 0th component to extract the batch dim. + action_component = tree.flatten(actions)[0] + logp_ = tf.zeros_like(action_component, dtype=tf.float32) + # Logp's should be single values (but with the same batch dim as + # `deterministic_actions` or `stochastic_actions`). In case + # actions are just [B], zeros_like works just fine here, but if + # actions are [B, ...], we have to reduce logp back to just [B]. + while len(logp_.shape) > 1: + logp_ = logp_[:, 0] + return logp_ + + +@DeveloperAPI +def warn_if_infinite_kl_divergence( + policy: Type["TFPolicy"], mean_kl: TensorType +) -> None: + def print_warning(): + logger.warning( + "KL divergence is non-finite, this will likely destabilize your model and" + " the training process. Action(s) in a specific state have near-zero" + " probability. This can happen naturally in deterministic environments" + " where the optimal policy has zero mass for a specific action. To fix this" + " issue, consider setting the coefficient for the KL loss term to zero or" + " increasing policy entropy." + ) + return tf.constant(0.0) + + if policy.loss_initialized(): + tf.cond( + tf.math.is_inf(mean_kl), + false_fn=lambda: tf.constant(0.0), + true_fn=lambda: print_warning(), + ) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/threading.py b/.venv/lib/python3.11/site-packages/ray/rllib/utils/threading.py new file mode 100644 index 0000000000000000000000000000000000000000..a9a4461dadbf69afa24ed8e9007fe326640dc7de --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/utils/threading.py @@ -0,0 +1,34 @@ +from typing import Callable + +from ray.rllib.utils.annotations import OldAPIStack + + +@OldAPIStack +def with_lock(func: Callable) -> Callable: + """Use as decorator (@withlock) around object methods that need locking. 
+ + Note: The object must have a self._lock = threading.Lock() property. + Locking thus works on the object level (no two locked methods of the same + object can be called asynchronously). + + Args: + func: The function to decorate/wrap. + + Returns: + The wrapped (object-level locked) function. + """ + + def wrapper(self, *a, **k): + try: + with self._lock: + return func(self, *a, **k) + except AttributeError as e: + if "has no attribute '_lock'" in e.args[0]: + raise AttributeError( + "Object {} must have a `self._lock` property (assigned " + "to a threading.RLock() object in its " + "constructor)!".format(self) + ) + raise e + + return wrapper diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/torch_utils.py b/.venv/lib/python3.11/site-packages/ray/rllib/utils/torch_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e3783d0583c5f9a2453dfc517e462c5d3eb503cd --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/utils/torch_utils.py @@ -0,0 +1,726 @@ +import logging +import os +import warnings +from typing import Dict, List, Optional, TYPE_CHECKING, Union + +import gymnasium as gym +from gymnasium.spaces import Discrete, MultiDiscrete +import numpy as np +from packaging import version +import tree # pip install dm_tree + +from ray.rllib.models.repeated_values import RepeatedValues +from ray.rllib.utils.annotations import PublicAPI, DeveloperAPI, OldAPIStack +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.numpy import SMALL_NUMBER +from ray.rllib.utils.typing import ( + LocalOptimizer, + NetworkType, + SpaceStruct, + TensorStructType, + TensorType, +) + +if TYPE_CHECKING: + from ray.rllib.core.learner.learner import ParamDict, ParamList + from ray.rllib.policy.torch_policy import TorchPolicy + from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2 + +logger = logging.getLogger(__name__) +torch, nn = try_import_torch() + +# Limit values suitable for use as close to a -inf logit. 
These are useful +# since -inf / inf cause NaNs during backprop. +FLOAT_MIN = -3.4e38 +FLOAT_MAX = 3.4e38 + +if torch: + TORCH_COMPILE_REQUIRED_VERSION = version.parse("2.0.0") +else: + TORCH_COMPILE_REQUIRED_VERSION = ValueError( + "torch is not installed. " "TORCH_COMPILE_REQUIRED_VERSION is " "not defined." + ) + + +@OldAPIStack +def apply_grad_clipping( + policy: "TorchPolicy", optimizer: LocalOptimizer, loss: TensorType +) -> Dict[str, TensorType]: + """Applies gradient clipping to already computed grads inside `optimizer`. + + Note: This function does NOT perform an analogous operation as + tf.clip_by_global_norm. It merely clips by norm (per gradient tensor) and + then computes the global norm across all given tensors (but without clipping + by that global norm). + + Args: + policy: The TorchPolicy, which calculated `loss`. + optimizer: A local torch optimizer object. + loss: The torch loss tensor. + + Returns: + An info dict containing the "grad_norm" key and the resulting clipped + gradients. + """ + grad_gnorm = 0 + if policy.config["grad_clip"] is not None: + clip_value = policy.config["grad_clip"] + else: + clip_value = np.inf + + num_none_grads = 0 + for param_group in optimizer.param_groups: + # Make sure we only pass params with grad != None into torch + # clip_grad_norm_. Would fail otherwise. + params = list(filter(lambda p: p.grad is not None, param_group["params"])) + if params: + # PyTorch clips gradients inplace and returns the norm before clipping + # We therefore need to compute grad_gnorm further down (fixes #4965) + global_norm = nn.utils.clip_grad_norm_(params, clip_value) + + if isinstance(global_norm, torch.Tensor): + global_norm = global_norm.cpu().numpy() + + grad_gnorm += min(global_norm, clip_value) + else: + num_none_grads += 1 + + # Note (Kourosh): grads could indeed be zero. This method should still return + # grad_gnorm in that case. 
+ if num_none_grads == len(optimizer.param_groups): + # No grads available + return {} + return {"grad_gnorm": grad_gnorm} + + +@PublicAPI +def clip_gradients( + gradients_dict: "ParamDict", + *, + grad_clip: Optional[float] = None, + grad_clip_by: str = "value", +) -> TensorType: + """Performs gradient clipping on a grad-dict based on a clip value and clip mode. + + Changes the provided gradient dict in place. + + Args: + gradients_dict: The gradients dict, mapping str to gradient tensors. + grad_clip: The value to clip with. The way gradients are clipped is defined + by the `grad_clip_by` arg (see below). + grad_clip_by: One of 'value', 'norm', or 'global_norm'. + + Returns: + If `grad_clip_by`="global_norm" and `grad_clip` is not None, returns the global + norm of all tensors, otherwise returns None. + """ + # No clipping, return. + if grad_clip is None: + return + + # Clip by value (each gradient individually). + if grad_clip_by == "value": + for k, v in gradients_dict.copy().items(): + gradients_dict[k] = ( + None if v is None else torch.clip(v, -grad_clip, grad_clip) + ) + + # Clip by L2-norm (per gradient tensor). + elif grad_clip_by == "norm": + for k, v in gradients_dict.copy().items(): + if v is not None: + # Compute the L2-norm of the gradient tensor. + norm = v.norm(2).nan_to_num(neginf=-10e8, posinf=10e8) + # Clip all the gradients. + if norm > grad_clip: + v.mul_(grad_clip / norm) + + # Clip by global L2-norm (across all gradient tensors). + else: + assert ( + grad_clip_by == "global_norm" + ), f"`grad_clip_by` ({grad_clip_by}) must be one of [value|norm|global_norm]!" + gradients_list = list(gradients_dict.values()) + total_norm = compute_global_norm(gradients_list) + if len(gradients_list) == 0: + return total_norm + # We do want the coefficient to be in between 0.0 and 1.0, therefore + # if the global_norm is smaller than the clip value, we use the clip value + # as normalization constant. 
+ device = gradients_list[0].device + clip_coef = grad_clip / torch.maximum( + torch.tensor(grad_clip).to(device), total_norm + 1e-6 + ) + # Note: multiplying by the clamped coef is redundant when the coef is clamped to + # 1, but doing so avoids a `if clip_coef < 1:` conditional which can require a + # CPU <=> device synchronization when the gradients do not reside in CPU memory. + clip_coef_clamped = torch.clamp(clip_coef, max=1.0) + for g in gradients_list: + if g is not None: + g.detach().mul_(clip_coef_clamped.to(g.device)) + return total_norm + + +@PublicAPI +def compute_global_norm(gradients_list: "ParamList") -> TensorType: + """Computes the global norm for a gradients dict. + + Args: + gradients_list: The gradients list containing parameters. + + Returns: + Returns the global norm of all tensors in `gradients_list`. + """ + # Define the norm type to be L2. + norm_type = 2.0 + # If we have no grads, return zero. + if len(gradients_list) == 0: + return torch.tensor(0.0) + device = gradients_list[0].device + + # Compute the global norm. + total_norm = torch.norm( + torch.stack( + [ + torch.norm(g.detach(), norm_type) + # Note, we want to avoid overflow in the norm computation, this does + # not affect the gradients themselves as we clamp by multiplying and + # not by overriding tensor values. + .nan_to_num(neginf=-10e8, posinf=10e8).to(device) + for g in gradients_list + if g is not None + ] + ), + norm_type, + ).nan_to_num(neginf=-10e8, posinf=10e8) + if torch.logical_or(total_norm.isnan(), total_norm.isinf()): + raise RuntimeError( + f"The total norm of order {norm_type} for gradients from " + "`parameters` is non-finite, so it cannot be clipped. " + ) + # Return the global norm. + return total_norm + + +@OldAPIStack +def concat_multi_gpu_td_errors( + policy: Union["TorchPolicy", "TorchPolicyV2"] +) -> Dict[str, TensorType]: + """Concatenates multi-GPU (per-tower) TD error tensors given TorchPolicy. 
+ + TD-errors are extracted from the TorchPolicy via its tower_stats property. + + Args: + policy: The TorchPolicy to extract the TD-error values from. + + Returns: + A dict mapping strings "td_error" and "mean_td_error" to the + corresponding concatenated and mean-reduced values. + """ + td_error = torch.cat( + [ + t.tower_stats.get("td_error", torch.tensor([0.0])).to(policy.device) + for t in policy.model_gpu_towers + ], + dim=0, + ) + policy.td_error = td_error + return { + "td_error": td_error, + "mean_td_error": torch.mean(td_error), + } + + +@PublicAPI +def convert_to_torch_tensor( + x: TensorStructType, + device: Optional[str] = None, + pin_memory: bool = False, +): + """Converts any struct to torch.Tensors. + + Args: + x: Any (possibly nested) struct, the values in which will be + converted and returned as a new struct with all leaves converted + to torch tensors. + device: The device to create the tensor on. + pin_memory: If True, will call the `pin_memory()` method on the created tensors. + + Returns: + Any: A new struct with the same structure as `x`, but with all + values converted to torch Tensor types. This does not convert possibly + nested elements that are None because torch has no representation for that. + """ + + def mapping(item): + if item is None: + # Torch has no representation for `None`, so we return None + return item + + # Special handling of "Repeated" values. + if isinstance(item, RepeatedValues): + return RepeatedValues( + tree.map_structure(mapping, item.values), item.lengths, item.max_len + ) + + # Already torch tensor -> make sure it's on right device. + if torch.is_tensor(item): + tensor = item + # Numpy arrays. + elif isinstance(item, np.ndarray): + # Object type (e.g. info dicts in train batch): leave as-is. + # str type (e.g. agent_id in train batch): leave as-is. + if item.dtype == object or item.dtype.type is np.str_: + return item + # Non-writable numpy-arrays will cause PyTorch warning. 
+ elif item.flags.writeable is False: + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + tensor = torch.from_numpy(item) + # Already numpy: Wrap as torch tensor. + else: + tensor = torch.from_numpy(item) + # Everything else: Convert to numpy, then wrap as torch tensor. + else: + tensor = torch.from_numpy(np.asarray(item)) + + # Floatify all float64 tensors (but leave float16 as-is). + if tensor.is_floating_point() and str(tensor.dtype) != "torch.float16": + tensor = tensor.float() + + # Pin the tensor's memory (for faster transfer to GPU later). + if pin_memory and torch.cuda.is_available(): + tensor.pin_memory() + + return tensor if device is None else tensor.to(device) + + return tree.map_structure(mapping, x) + + +@PublicAPI +def copy_torch_tensors(x: TensorStructType, device: Optional[str] = None): + """Creates a copy of `x` and makes deep copies torch.Tensors in x. + + Also moves the copied tensors to the specified device (if not None). + + Note if an object in x is not a torch.Tensor, it will be shallow-copied. + + Args: + x : Any (possibly nested) struct possibly containing torch.Tensors. + device : The device to move the tensors to. + + Returns: + Any: A new struct with the same structure as `x`, but with all + torch.Tensors deep-copied and moved to the specified device. + + """ + + def mapping(item): + if isinstance(item, torch.Tensor): + return ( + torch.clone(item.detach()) + if device is None + else item.detach().to(device) + ) + else: + return item + + return tree.map_structure(mapping, x) + + +@PublicAPI +def explained_variance(y: TensorType, pred: TensorType) -> TensorType: + """Computes the explained variance for a pair of labels and predictions. + + The formula used is: + max(-1.0, 1.0 - (std(y - pred)^2 / std(y)^2)) + + Args: + y: The labels. + pred: The predictions. + + Returns: + The explained variance given a pair of labels and predictions. 
+ """ + y_var = torch.var(y, dim=[0]) + diff_var = torch.var(y - pred, dim=[0]) + min_ = torch.tensor([-1.0]).to(pred.device) + return torch.max(min_, 1 - (diff_var / (y_var + SMALL_NUMBER)))[0] + + +@PublicAPI +def flatten_inputs_to_1d_tensor( + inputs: TensorStructType, + spaces_struct: Optional[SpaceStruct] = None, + time_axis: bool = False, +) -> TensorType: + """Flattens arbitrary input structs according to the given spaces struct. + + Returns a single 1D tensor resulting from the different input + components' values. + + Thereby: + - Boxes (any shape) get flattened to (B, [T]?, -1). Note that image boxes + are not treated differently from other types of Boxes and get + flattened as well. + - Discrete (int) values are one-hot'd, e.g. a batch of [1, 0, 3] (B=3 with + Discrete(4) space) results in [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1]]. + - MultiDiscrete values are multi-one-hot'd, e.g. a batch of + [[0, 2], [1, 4]] (B=2 with MultiDiscrete([2, 5]) space) results in + [[1, 0, 0, 0, 1, 0, 0], [0, 1, 0, 0, 0, 0, 1]]. + + Args: + inputs: The inputs to be flattened. + spaces_struct: The structure of the spaces that behind the input + time_axis: Whether all inputs have a time-axis (after the batch axis). + If True, will keep not only the batch axis (0th), but the time axis + (1st) as-is and flatten everything from the 2nd axis up. + + Returns: + A single 1D tensor resulting from concatenating all + flattened/one-hot'd input components. Depending on the time_axis flag, + the shape is (B, n) or (B, T, n). + + .. 
testcode:: + + from gymnasium.spaces import Discrete, Box + from ray.rllib.utils.torch_utils import flatten_inputs_to_1d_tensor + import torch + struct = { + "a": np.array([1, 3]), + "b": ( + np.array([[1.0, 2.0], [4.0, 5.0]]), + np.array( + [[[8.0], [7.0]], [[5.0], [4.0]]] + ), + ), + "c": { + "cb": np.array([1.0, 2.0]), + }, + } + struct_torch = tree.map_structure(lambda s: torch.from_numpy(s), struct) + spaces = dict( + { + "a": gym.spaces.Discrete(4), + "b": (gym.spaces.Box(-1.0, 10.0, (2,)), gym.spaces.Box(-1.0, 1.0, (2, + 1))), + "c": dict( + { + "cb": gym.spaces.Box(-1.0, 1.0, ()), + } + ), + } + ) + print(flatten_inputs_to_1d_tensor(struct_torch, spaces_struct=spaces)) + + .. testoutput:: + + tensor([[0., 1., 0., 0., 1., 2., 8., 7., 1.], + [0., 0., 0., 1., 4., 5., 5., 4., 2.]]) + + """ + + flat_inputs = tree.flatten(inputs) + flat_spaces = ( + tree.flatten(spaces_struct) + if spaces_struct is not None + else [None] * len(flat_inputs) + ) + + B = None + T = None + out = [] + for input_, space in zip(flat_inputs, flat_spaces): + # Store batch and (if applicable) time dimension. + if B is None: + B = input_.shape[0] + if time_axis: + T = input_.shape[1] + + # One-hot encoding. + if isinstance(space, Discrete): + if time_axis: + input_ = torch.reshape(input_, [B * T]) + out.append(one_hot(input_, space).float()) + # Multi one-hot encoding. + elif isinstance(space, MultiDiscrete): + if time_axis: + input_ = torch.reshape(input_, [B * T, -1]) + out.append(one_hot(input_, space).float()) + # Box: Flatten. + else: + if time_axis: + input_ = torch.reshape(input_, [B * T, -1]) + else: + input_ = torch.reshape(input_, [B, -1]) + out.append(input_.float()) + + merged = torch.cat(out, dim=-1) + # Restore the time-dimension, if applicable. + if time_axis: + merged = torch.reshape(merged, [B, T, -1]) + + return merged + + +@PublicAPI +def global_norm(tensors: List[TensorType]) -> TensorType: + """Returns the global L2 norm over a list of tensors. 
+ + output = sqrt(SUM(t ** 2 for t in tensors)), + where SUM reduces over all tensors and over all elements in tensors. + + Args: + tensors: The list of tensors to calculate the global norm over. + + Returns: + The global L2 norm over the given tensor list. + """ + # List of single tensors' L2 norms: SQRT(SUM(xi^2)) over all xi in tensor. + single_l2s = [torch.pow(torch.sum(torch.pow(t, 2.0)), 0.5) for t in tensors] + # Compute global norm from all single tensors' L2 norms. + return torch.pow(sum(torch.pow(l2, 2.0) for l2 in single_l2s), 0.5) + + +@OldAPIStack +def huber_loss(x: TensorType, delta: float = 1.0) -> TensorType: + """Computes the huber loss for a given term and delta parameter. + + Reference: https://en.wikipedia.org/wiki/Huber_loss + Note that the factor of 0.5 is implicitly included in the calculation. + + Formula: + L = 0.5 * x^2 for small abs x (delta threshold) + L = delta * (abs(x) - 0.5*delta) for larger abs x (delta threshold) + + Args: + x: The input term, e.g. a TD error. + delta: The delta parmameter in the above formula. + + Returns: + The Huber loss resulting from `x` and `delta`. + """ + return torch.where( + torch.abs(x) < delta, + torch.pow(x, 2.0) * 0.5, + delta * (torch.abs(x) - 0.5 * delta), + ) + + +@OldAPIStack +def l2_loss(x: TensorType) -> TensorType: + """Computes half the L2 norm over a tensor's values without the sqrt. + + output = 0.5 * sum(x ** 2) + + Args: + x: The input tensor. + + Returns: + 0.5 times the L2 norm over the given tensor's values (w/o sqrt). + """ + return 0.5 * torch.sum(torch.pow(x, 2.0)) + + +@PublicAPI +def one_hot(x: TensorType, space: gym.Space) -> TensorType: + """Returns a one-hot tensor, given and int tensor and a space. + + Handles the MultiDiscrete case as well. + + Args: + x: The input tensor. + space: The space to use for generating the one-hot tensor. + + Returns: + The resulting one-hot tensor. + + Raises: + ValueError: If the given space is not a discrete one. + + .. 
testcode:: + + import torch + import gymnasium as gym + from ray.rllib.utils.torch_utils import one_hot + x = torch.IntTensor([0, 3]) # batch-dim=2 + # Discrete space with 4 (one-hot) slots per batch item. + s = gym.spaces.Discrete(4) + print(one_hot(x, s)) + x = torch.IntTensor([[0, 1, 2, 3]]) # batch-dim=1 + # MultiDiscrete space with 5 + 4 + 4 + 7 = 20 (one-hot) slots + # per batch item. + s = gym.spaces.MultiDiscrete([5, 4, 4, 7]) + print(one_hot(x, s)) + + .. testoutput:: + + tensor([[1, 0, 0, 0], + [0, 0, 0, 1]]) + tensor([[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0]]) + """ + if isinstance(space, Discrete): + return nn.functional.one_hot(x.long(), space.n) + elif isinstance(space, MultiDiscrete): + if isinstance(space.nvec[0], np.ndarray): + nvec = np.ravel(space.nvec) + x = x.reshape(x.shape[0], -1) + else: + nvec = space.nvec + return torch.cat( + [nn.functional.one_hot(x[:, i].long(), n) for i, n in enumerate(nvec)], + dim=-1, + ) + else: + raise ValueError("Unsupported space for `one_hot`: {}".format(space)) + + +@PublicAPI +def reduce_mean_ignore_inf(x: TensorType, axis: Optional[int] = None) -> TensorType: + """Same as torch.mean() but ignores -inf values. + + Args: + x: The input tensor to reduce mean over. + axis: The axis over which to reduce. None for all axes. + + Returns: + The mean reduced inputs, ignoring inf values. + """ + mask = torch.ne(x, float("-inf")) + x_zeroed = torch.where(mask, x, torch.zeros_like(x)) + return torch.sum(x_zeroed, axis) / torch.sum(mask.float(), axis) + + +@PublicAPI +def sequence_mask( + lengths: TensorType, + maxlen: Optional[int] = None, + dtype=None, + time_major: bool = False, +) -> TensorType: + """Offers same behavior as tf.sequence_mask for torch. + + Thanks to Dimitris Papatheodorou + (https://discuss.pytorch.org/t/pytorch-equivalent-for-tf-sequence-mask/ + 39036). + + Args: + lengths: The tensor of individual lengths to mask by. + maxlen: The maximum length to use for the time axis. 
If None, use + the max of `lengths`. + dtype: The torch dtype to use for the resulting mask. + time_major: Whether to return the mask as [B, T] (False; default) or + as [T, B] (True). + + Returns: + The sequence mask resulting from the given input and parameters. + """ + # If maxlen not given, use the longest lengths in the `lengths` tensor. + if maxlen is None: + maxlen = lengths.max() + + mask = torch.ones(tuple(lengths.shape) + (int(maxlen),)) + + mask = ~(mask.to(lengths.device).cumsum(dim=1).t() > lengths) + # Time major transformation. + if not time_major: + mask = mask.t() + + # By default, set the mask to be boolean. + mask.type(dtype or torch.bool) + + return mask + + +@PublicAPI +def update_target_network( + main_net: NetworkType, + target_net: NetworkType, + tau: float, +) -> None: + """Updates a torch.nn.Module target network using Polyak averaging. + + .. code-block:: text + + new_target_net_weight = ( + tau * main_net_weight + (1.0 - tau) * current_target_net_weight + ) + + Args: + main_net: The nn.Module to update from. + target_net: The target network to update. + tau: The tau value to use in the Polyak averaging formula. + """ + # Get the current parameters from the Q network. + state_dict = main_net.state_dict() + # Use here Polyak averaging. + new_state_dict = { + k: tau * state_dict[k] + (1 - tau) * v + for k, v in target_net.state_dict().items() + } + # Apply the new parameters to the target Q network. + target_net.load_state_dict(new_state_dict) + + +@DeveloperAPI +def warn_if_infinite_kl_divergence( + policy: "TorchPolicy", + kl_divergence: TensorType, +) -> None: + if policy.loss_initialized() and kl_divergence.isinf(): + logger.warning( + "KL divergence is non-finite, this will likely destabilize your model and" + " the training process. Action(s) in a specific state have near-zero" + " probability. This can happen naturally in deterministic environments" + " where the optimal policy has zero mass for a specific action. 
To fix this" + " issue, consider setting the coefficient for the KL loss term to zero or" + " increasing policy entropy." + ) + + +@PublicAPI +def set_torch_seed(seed: Optional[int] = None) -> None: + """Sets the torch random seed to the given value. + + Args: + seed: The seed to use or None for no seeding. + """ + if seed is not None and torch: + torch.manual_seed(seed) + # See https://github.com/pytorch/pytorch/issues/47672. + cuda_version = torch.version.cuda + if cuda_version is not None and float(torch.version.cuda) >= 10.2: + os.environ["CUBLAS_WORKSPACE_CONFIG"] = "4096:8" + else: + # Not all Operations support this. + torch.use_deterministic_algorithms(True) + # This is only for Convolution no problem. + torch.backends.cudnn.deterministic = True + + +@PublicAPI +def softmax_cross_entropy_with_logits( + logits: TensorType, + labels: TensorType, +) -> TensorType: + """Same behavior as tf.nn.softmax_cross_entropy_with_logits. + + Args: + x: The input predictions. + labels: The labels corresponding to `x`. + + Returns: + The resulting softmax cross-entropy given predictions and labels. 
+ """ + return torch.sum(-labels * nn.functional.log_softmax(logits, -1), -1) + + +def _dynamo_is_available(): + # This only works if torch._dynamo is available + try: + # TODO(Artur): Remove this once torch._dynamo is available on CI + import torch._dynamo as dynamo # noqa: F401 + + return True + except ImportError: + return False diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/typing.py b/.venv/lib/python3.11/site-packages/ray/rllib/utils/typing.py new file mode 100644 index 0000000000000000000000000000000000000000..1b4cd2f41f0078e49bdee33594f4632b99f654da --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/utils/typing.py @@ -0,0 +1,310 @@ +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Dict, + Hashable, + List, + Optional, + Sequence, + Tuple, + Type, + TypeVar, + Union, +) + +import numpy as np +import gymnasium as gym + +from ray.rllib.utils.annotations import OldAPIStack + +if TYPE_CHECKING: + from ray.rllib.core.rl_module.rl_module import RLModuleSpec + from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec + from ray.rllib.env.env_context import EnvContext + from ray.rllib.env.multi_agent_episode import MultiAgentEpisode + from ray.rllib.env.single_agent_episode import SingleAgentEpisode + from ray.rllib.policy.dynamic_tf_policy_v2 import DynamicTFPolicyV2 + from ray.rllib.policy.eager_tf_policy_v2 import EagerTFPolicyV2 + from ray.rllib.policy.policy import PolicySpec + from ray.rllib.policy.sample_batch import MultiAgentBatch, SampleBatch + from ray.rllib.policy.view_requirement import ViewRequirement + from ray.rllib.utils import try_import_jax, try_import_tf, try_import_torch + + _, tf, _ = try_import_tf() + torch, _ = try_import_torch() + jax, _ = try_import_jax() + jnp = None + if jax is not None: + jnp = jax.numpy + +# Represents a generic tensor type. +# This could be an np.ndarray, tf.Tensor, or a torch.Tensor. 
+TensorType = Union[np.array, "jnp.ndarray", "tf.Tensor", "torch.Tensor"] + +# Either a plain tensor, or a dict or tuple of tensors (or StructTensors). +TensorStructType = Union[TensorType, dict, tuple] + +# A shape of a tensor. +TensorShape = Union[Tuple[int], List[int]] + +# A neural network +NetworkType = Union["torch.nn.Module", "tf.keras.Model"] + +# An RLModule spec (single-agent or multi-agent). +RLModuleSpecType = Union["RLModuleSpec", "MultiRLModuleSpec"] + +# A state dict of an RLlib component (e.g. EnvRunner, Learner, RLModule). +StateDict = Dict[str, Any] + +# Represents a fully filled out config of a Algorithm class. +# Note: Policy config dicts are usually the same as AlgorithmConfigDict, but +# parts of it may sometimes be altered in e.g. a multi-agent setup, +# where we have >1 Policies in the same Algorithm. +AlgorithmConfigDict = dict # @OldAPIStack + +# An algorithm config dict that only has overrides. It needs to be combined with +# the default algorithm config to be used. +PartialAlgorithmConfigDict = dict # @OldAPIStack + +# Represents the model config sub-dict of the algo config that is passed to +# the model catalog. +ModelConfigDict = dict # @OldAPIStack + +# Conv2D configuration format. +# Each entry in the outer list represents one Conv2D layer. +# Each inner list has the format: [num_output_filters, kernel, stride], where kernel +# and stride may be single ints (width and height are the same) or 2-tuples (int, int) +# for width and height (different values). +ConvFilterSpec = List[ + Tuple[int, Union[int, Tuple[int, int]], Union[int, Tuple[int, int]]] +] + +# Objects that can be created through the `from_config()` util method +# need a config dict with a "type" key, a class path (str), or a type directly. +FromConfigSpec = Union[Dict[str, Any], type, str] + +# Represents the env_config sub-dict of the algo config that is passed to +# the env constructor. +EnvConfigDict = dict + +# Represents an environment id. 
These could be: +# - An int index for a sub-env within a vectorized env. +# - An external env ID (str), which changes(!) each episode. +EnvID = Union[int, str] + +# Represents a BaseEnv, MultiAgentEnv, ExternalEnv, ExternalMultiAgentEnv, +# VectorEnv, gym.Env, or ActorHandle. +# TODO (sven): Specify this type more strictly (it should just be gym.Env). +EnvType = Union[Any, gym.Env] + +# A callable, taking a EnvContext object +# (config dict + properties: `worker_index`, `vector_index`, `num_workers`, +# and `remote`) and returning an env object (or None if no env is used). +EnvCreator = Callable[["EnvContext"], Optional[EnvType]] + +# Represents a generic identifier for an agent (e.g., "agent1"). +AgentID = Any + +# Represents a generic identifier for a policy (e.g., "pol1"). +PolicyID = str # @OldAPIStack +# Represents a generic identifier for a (single-agent) RLModule. +ModuleID = str + +# Type of the config.policies dict for multi-agent training. +MultiAgentPolicyConfigDict = Dict[PolicyID, "PolicySpec"] # @OldAPIStack + +# A new stack Episode type: Either single-agent or multi-agent. +EpisodeType = Union["SingleAgentEpisode", "MultiAgentEpisode"] + +# Is Policy to train callable. +# @OldAPIStack +IsPolicyToTrain = Callable[[PolicyID, Optional["MultiAgentBatch"]], bool] + +# Agent to module mapping and should-module-be-updated. +AgentToModuleMappingFn = Callable[[AgentID, EpisodeType], ModuleID] +ShouldModuleBeUpdatedFn = Union[ + Sequence[ModuleID], + Callable[[ModuleID, Optional["MultiAgentBatch"]], bool], +] + +# State dict of a Policy, mapping strings (e.g. "weights") to some state +# data (TensorStructType). +PolicyState = Dict[str, TensorStructType] # @OldAPIStack + +# Any tf Policy type (static-graph or eager Policy). +TFPolicyV2Type = Type[Union["DynamicTFPolicyV2", "EagerTFPolicyV2"]] # @OldAPIStack + +# Represents an episode id (old and new API stack). 
+EpisodeID = Union[int, str] + +# Represents an "unroll" (maybe across different sub-envs in a vector env). +UnrollID = int # @OldAPIStack + +# A dict keyed by agent ids, e.g. {"agent-1": value}. +MultiAgentDict = Dict[AgentID, Any] + +# A dict keyed by env ids that contain further nested dictionaries keyed by +# agent ids. e.g., {"env-1": {"agent-1": value}}. +MultiEnvDict = Dict[EnvID, MultiAgentDict] + +# Represents an observation returned from the env. +EnvObsType = Any + +# Represents an action passed to the env. +EnvActionType = Any + +# Info dictionary returned by calling `reset()` or `step()` on `gymnasium.Env` +# instances. Might be an empty dict. +EnvInfoDict = dict + +# Represents a File object +FileType = Any + +# Represents a ViewRequirements dict mapping column names (str) to +# ViewRequirement objects. +ViewRequirementsDict = Dict[str, "ViewRequirement"] # @OldAPIStack + +# Represents the result dict returned by Algorithm.train() and algorithm components, +# such as EnvRunners, LearnerGroup, etc.. Also, the MetricsLogger used by all these +# components returns this upon its `reduce()` method call, so a ResultDict can further +# be accumulated (and reduced again) by downstream components. +ResultDict = Dict + +# A tf or torch local optimizer object. +LocalOptimizer = Union["torch.optim.Optimizer", "tf.keras.optimizers.Optimizer"] +Optimizer = LocalOptimizer +Param = Union["torch.Tensor", "tf.Variable"] +ParamRef = Hashable +ParamDict = Dict[ParamRef, Param] +ParamList = List[Param] + +# A single learning rate or a learning rate schedule (list of sub-lists, each of +# the format: [ts (int), lr_to_reach_by_ts (float)]). +LearningRateOrSchedule = Union[ + float, + List[List[Union[int, float]]], + List[Tuple[int, Union[int, float]]], +] + +# Dict of tensors returned by compute gradients on the policy, e.g., +# {"td_error": [...], "learner_stats": {"vf_loss": ..., ...}}, for multi-agent, +# {"policy1": {"learner_stats": ..., }, "policy2": ...}. 
+GradInfoDict = dict + +# Dict of learner stats returned by compute gradients on the policy, e.g., +# {"vf_loss": ..., ...}. This will always be nested under the "learner_stats" +# key(s) of a GradInfoDict. In the multi-agent case, this will be keyed by +# policy id. +LearnerStatsDict = dict + +# List of grads+var tuples (tf) or list of gradient tensors (torch) +# representing model gradients and returned by compute_gradients(). +ModelGradients = Union[List[Tuple[TensorType, TensorType]], List[TensorType]] + +# Type of dict returned by get_weights() representing model weights. +ModelWeights = dict + +# An input dict used for direct ModelV2 calls. +ModelInputDict = Dict[str, TensorType] + +# Some kind of sample batch. +SampleBatchType = Union["SampleBatch", "MultiAgentBatch", Dict[str, Any]] + +# A (possibly nested) space struct: Either a gym.spaces.Space or a +# (possibly nested) dict|tuple of gym.space.Spaces. +SpaceStruct = Union[gym.spaces.Space, dict, tuple] + +# A list of batches of RNN states. +# Each item in this list has dimension [B, S] (S=state vector size) +StateBatches = List[List[Any]] # @OldAPIStack + +# Format of data output from policy forward pass. +# __sphinx_doc_begin_policy_output_type__ +PolicyOutputType = Tuple[TensorStructType, StateBatches, Dict] # @OldAPIStack +# __sphinx_doc_end_policy_output_type__ + + +# __sphinx_doc_begin_agent_connector_data_type__ +@OldAPIStack +class AgentConnectorDataType: + """Data type that is fed into and yielded from agent connectors. + + Args: + env_id: ID of the environment. + agent_id: ID to help identify the agent from which the data is received. + data: A payload (``data``). With RLlib's default sampler, the payload + is a dictionary of arbitrary data columns (obs, rewards, terminateds, + truncateds, etc). 
+ """ + + def __init__(self, env_id: str, agent_id: str, data: Any): + self.env_id = env_id + self.agent_id = agent_id + self.data = data + + +# __sphinx_doc_end_agent_connector_data_type__ + + +# __sphinx_doc_begin_action_connector_output__ +@OldAPIStack +class ActionConnectorDataType: + """Data type that is fed into and yielded from agent connectors. + + Args: + env_id: ID of the environment. + agent_id: ID to help identify the agent from which the data is received. + input_dict: Input data that was passed into the policy. + Sometimes output must be adapted based on the input, for example + action masking. So the entire input data structure is provided here. + output: An object of PolicyOutputType. It is is composed of the + action output, the internal state output, and additional data fetches. + + """ + + def __init__( + self, + env_id: str, + agent_id: str, + input_dict: TensorStructType, + output: PolicyOutputType, + ): + self.env_id = env_id + self.agent_id = agent_id + self.input_dict = input_dict + self.output = output + + +# __sphinx_doc_end_action_connector_output__ + + +# __sphinx_doc_begin_agent_connector_output__ +@OldAPIStack +class AgentConnectorsOutput: + """Final output data type of agent connectors. + + Args are populated depending on the AgentConnector settings. + The branching happens in ViewRequirementAgentConnector. + + Args: + raw_dict: The raw input dictionary that sampler can use to + build episodes and training batches. + This raw dict also gets passed into ActionConnectors in case + it contains data useful for action adaptation (e.g. action masks). + sample_batch: The SampleBatch that can be immediately used for + querying the policy for next action. + """ + + def __init__( + self, raw_dict: Dict[str, TensorStructType], sample_batch: "SampleBatch" + ): + self.raw_dict = raw_dict + self.sample_batch = sample_batch + + +# __sphinx_doc_end_agent_connector_output__ + + +# Generic type var. +T = TypeVar("T")