diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e730866595c116798a29cb83852fcc2ac0094ec0 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5839518d4863e03ad2f4c253a0fe798279cd0dca Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/__pycache__/clip.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/__pycache__/clip.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1ed977ab10163a9c5c0f8748a000d5662887991b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/__pycache__/clip.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/__pycache__/immutable.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/__pycache__/immutable.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f010f33588579f2bc564d01233eedb12281f7bc0 Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/__pycache__/immutable.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/__pycache__/lambdas.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/__pycache__/lambdas.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a8190bece945b48612ac43b898171cce588479e9 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/__pycache__/lambdas.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/__pycache__/normalize.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/__pycache__/normalize.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..24e3f66443ac1e511446ea5ee46410fb5b6ee547 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/__pycache__/normalize.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/__pycache__/pipeline.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/__pycache__/pipeline.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..781bea611eaa5a122a4a13eff9a54cdb0a906a56 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/__pycache__/pipeline.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/clip.py b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/clip.py new file mode 100644 index 0000000000000000000000000000000000000000..da7c8b97bf927bf4d97c7feed8285faa55ac89cf --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/clip.py @@ -0,0 +1,41 @@ +from typing import Any + +from ray.rllib.connectors.connector import ( + ActionConnector, + ConnectorContext, +) +from 
ray.rllib.connectors.registry import register_connector +from ray.rllib.utils.spaces.space_utils import clip_action, get_base_struct_from_space +from ray.rllib.utils.typing import ActionConnectorDataType +from ray.rllib.utils.annotations import OldAPIStack + + +@OldAPIStack +class ClipActionsConnector(ActionConnector): + def __init__(self, ctx: ConnectorContext): + super().__init__(ctx) + + self._action_space_struct = get_base_struct_from_space(ctx.action_space) + + def transform(self, ac_data: ActionConnectorDataType) -> ActionConnectorDataType: + assert isinstance( + ac_data.output, tuple + ), "Action connector requires PolicyOutputType data." + + actions, states, fetches = ac_data.output + return ActionConnectorDataType( + ac_data.env_id, + ac_data.agent_id, + ac_data.input_dict, + (clip_action(actions, self._action_space_struct), states, fetches), + ) + + def to_state(self): + return ClipActionsConnector.__name__, None + + @staticmethod + def from_state(ctx: ConnectorContext, params: Any): + return ClipActionsConnector(ctx) + + +register_connector(ClipActionsConnector.__name__, ClipActionsConnector) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/immutable.py b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/immutable.py new file mode 100644 index 0000000000000000000000000000000000000000..3f5c8bbd197cb6345cf07e712f2477897e25766b --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/immutable.py @@ -0,0 +1,40 @@ +from typing import Any + +import tree # pip install dm_tree + +from ray.rllib.connectors.connector import ( + ActionConnector, + ConnectorContext, +) +from ray.rllib.connectors.registry import register_connector +from ray.rllib.utils.numpy import make_action_immutable +from ray.rllib.utils.typing import ActionConnectorDataType +from ray.rllib.utils.annotations import OldAPIStack + + +@OldAPIStack +class ImmutableActionsConnector(ActionConnector): + def transform(self, ac_data: 
ActionConnectorDataType) -> ActionConnectorDataType: + assert isinstance( + ac_data.output, tuple + ), "Action connector requires PolicyOutputType data." + + actions, states, fetches = ac_data.output + tree.traverse(make_action_immutable, actions, top_down=False) + + return ActionConnectorDataType( + ac_data.env_id, + ac_data.agent_id, + ac_data.input_dict, + (actions, states, fetches), + ) + + def to_state(self): + return ImmutableActionsConnector.__name__, None + + @staticmethod + def from_state(ctx: ConnectorContext, params: Any): + return ImmutableActionsConnector(ctx) + + +register_connector(ImmutableActionsConnector.__name__, ImmutableActionsConnector) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/lambdas.py b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/lambdas.py new file mode 100644 index 0000000000000000000000000000000000000000..3bf862dd834d57e8b77a677c5f6fa7df8755779e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/lambdas.py @@ -0,0 +1,76 @@ +from typing import Any, Callable, Dict, Type + +from ray.rllib.connectors.connector import ( + ActionConnector, + ConnectorContext, +) +from ray.rllib.connectors.registry import register_connector +from ray.rllib.utils.numpy import convert_to_numpy +from ray.rllib.utils.typing import ( + ActionConnectorDataType, + PolicyOutputType, + StateBatches, + TensorStructType, +) +from ray.rllib.utils.annotations import OldAPIStack + + +@OldAPIStack +def register_lambda_action_connector( + name: str, fn: Callable[[TensorStructType, StateBatches, Dict], PolicyOutputType] +) -> Type[ActionConnector]: + """A util to register any function transforming PolicyOutputType as an ActionConnector. + + The only requirement is that fn should take actions, states, and fetches as input, + and return transformed actions, states, and fetches. + + Args: + name: Name of the resulting actor connector. + fn: The function that transforms PolicyOutputType. 
+ + Returns: + A new ActionConnector class that transforms PolicyOutputType using fn. + """ + + class LambdaActionConnector(ActionConnector): + def transform( + self, ac_data: ActionConnectorDataType + ) -> ActionConnectorDataType: + assert isinstance( + ac_data.output, tuple + ), "Action connector requires PolicyOutputType data." + + actions, states, fetches = ac_data.output + return ActionConnectorDataType( + ac_data.env_id, + ac_data.agent_id, + ac_data.input_dict, + fn(actions, states, fetches), + ) + + def to_state(self): + return name, None + + @staticmethod + def from_state(ctx: ConnectorContext, params: Any): + return LambdaActionConnector(ctx) + + LambdaActionConnector.__name__ = name + LambdaActionConnector.__qualname__ = name + + register_connector(name, LambdaActionConnector) + + return LambdaActionConnector + + +# Convert actions and states into numpy arrays if necessary. +ConvertToNumpyConnector = OldAPIStack( + register_lambda_action_connector( + "ConvertToNumpyConnector", + lambda actions, states, fetches: ( + convert_to_numpy(actions), + convert_to_numpy(states), + fetches, + ), + ), +) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/normalize.py b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/normalize.py new file mode 100644 index 0000000000000000000000000000000000000000..67c3731469a76556350cd327f276f82765a22cff --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/normalize.py @@ -0,0 +1,44 @@ +from typing import Any + +from ray.rllib.connectors.connector import ( + ActionConnector, + ConnectorContext, +) +from ray.rllib.connectors.registry import register_connector +from ray.rllib.utils.spaces.space_utils import ( + get_base_struct_from_space, + unsquash_action, +) +from ray.rllib.utils.typing import ActionConnectorDataType +from ray.rllib.utils.annotations import OldAPIStack + + +@OldAPIStack +class NormalizeActionsConnector(ActionConnector): + def __init__(self, ctx: 
ConnectorContext): + super().__init__(ctx) + + self._action_space_struct = get_base_struct_from_space(ctx.action_space) + + def transform(self, ac_data: ActionConnectorDataType) -> ActionConnectorDataType: + assert isinstance( + ac_data.output, tuple + ), "Action connector requires PolicyOutputType data." + + actions, states, fetches = ac_data.output + return ActionConnectorDataType( + ac_data.env_id, + ac_data.agent_id, + ac_data.input_dict, + (unsquash_action(actions, self._action_space_struct), states, fetches), + ) + + def to_state(self): + return NormalizeActionsConnector.__name__, None + + @staticmethod + def from_state(ctx: ConnectorContext, params: Any): + return NormalizeActionsConnector(ctx) + + +register_connector(NormalizeActionsConnector.__name__, NormalizeActionsConnector) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/pipeline.py b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..a93fd3eb340ebd40fb5c7ec23b0bf5960735b992 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/pipeline.py @@ -0,0 +1,61 @@ +import logging +from typing import Any, List +from collections import defaultdict + +from ray.rllib.connectors.connector import ( + ActionConnector, + Connector, + ConnectorContext, + ConnectorPipeline, +) +from ray.rllib.connectors.registry import get_connector, register_connector +from ray.rllib.utils.annotations import OldAPIStack +from ray.rllib.utils.typing import ActionConnectorDataType +from ray.util.timer import _Timer + + +logger = logging.getLogger(__name__) + + +@OldAPIStack +class ActionConnectorPipeline(ConnectorPipeline, ActionConnector): + def __init__(self, ctx: ConnectorContext, connectors: List[Connector]): + super().__init__(ctx, connectors) + self.timers = defaultdict(_Timer) + + def __call__(self, ac_data: ActionConnectorDataType) -> ActionConnectorDataType: + for c in 
self.connectors: + timer = self.timers[str(c)] + with timer: + ac_data = c(ac_data) + return ac_data + + def to_state(self): + children = [] + for c in self.connectors: + state = c.to_state() + assert isinstance(state, tuple) and len(state) == 2, ( + "Serialized connector state must be in the format of " + f"Tuple[name: str, params: Any]. Instead we got {state}" + f"for connector {c.__name__}." + ) + children.append(state) + return ActionConnectorPipeline.__name__, children + + @staticmethod + def from_state(ctx: ConnectorContext, params: Any): + assert ( + type(params) is list + ), "ActionConnectorPipeline takes a list of connector params." + connectors = [] + for state in params: + try: + name, subparams = state + connectors.append(get_connector(name, ctx, subparams)) + except Exception as e: + logger.error(f"Failed to de-serialize connector state: {state}") + raise e + return ActionConnectorPipeline(ctx, connectors) + + +register_connector(ActionConnectorPipeline.__name__, ActionConnectorPipeline) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a3eddb42be53f95b6fa0d13494d746f110fba37a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/clip_reward.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/clip_reward.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7286021251ba9629eccf2b4fbd23cd69f293291e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/clip_reward.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/env_sampling.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/env_sampling.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b3384eb2840d4c8c6cce8539ffbaf57c1c7c188d Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/env_sampling.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/pipeline.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/pipeline.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8a0862f46d8d07d38277d67e1f34834f079cda10 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/pipeline.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/synced_filter.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/synced_filter.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..296c43114244772df4105befb5360ca3dcb11e00 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/synced_filter.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/view_requirement.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/view_requirement.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0ca2390f5eec0d6a30e6153446c0906241f02d25 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/view_requirement.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__init__.py 
b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..cf28ba9ae9fbc2924b9882a5ff5d476a772a2697 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__init__.py @@ -0,0 +1,22 @@ +from ray.rllib.connectors.common.add_observations_from_episodes_to_batch import ( + AddObservationsFromEpisodesToBatch, +) +from ray.rllib.connectors.common.add_states_from_episodes_to_batch import ( + AddStatesFromEpisodesToBatch, +) +from ray.rllib.connectors.common.add_time_dim_to_batch_and_zero_pad import ( + AddTimeDimToBatchAndZeroPad, +) +from ray.rllib.connectors.common.agent_to_module_mapping import AgentToModuleMapping +from ray.rllib.connectors.common.batch_individual_items import BatchIndividualItems +from ray.rllib.connectors.common.numpy_to_tensor import NumpyToTensor + + +__all__ = [ + "AddObservationsFromEpisodesToBatch", + "AddStatesFromEpisodesToBatch", + "AddTimeDimToBatchAndZeroPad", + "AgentToModuleMapping", + "BatchIndividualItems", + "NumpyToTensor", +] diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f3f8444307c76391aed886fd63d73ac52a2414a2 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/add_observations_from_episodes_to_batch.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/add_observations_from_episodes_to_batch.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f995a98ed27a3c6bbc6d9a39af8dec4f8039044f Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/add_observations_from_episodes_to_batch.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/add_states_from_episodes_to_batch.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/add_states_from_episodes_to_batch.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8aef8133e225593f94c9054b6ba21f4418713ebc Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/add_states_from_episodes_to_batch.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/add_time_dim_to_batch_and_zero_pad.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/add_time_dim_to_batch_and_zero_pad.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6a189e908db71fafccb06e2ced960aa195b7a6ea Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/add_time_dim_to_batch_and_zero_pad.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/agent_to_module_mapping.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/agent_to_module_mapping.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eb5c5e4682429cb6e682657fe519e8fb6de5f61f Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/agent_to_module_mapping.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/batch_individual_items.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/batch_individual_items.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..000def7a4eb1a79b21d3198cbb38bb503b4859ad Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/batch_individual_items.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/frame_stacking.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/frame_stacking.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f258c70bf3405caffe84ee44a3f93f91390ac4d7 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/frame_stacking.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/module_to_agent_unmapping.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/module_to_agent_unmapping.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..133d91fbda8ab0617407d13e43ea03ce804121d3 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/module_to_agent_unmapping.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/numpy_to_tensor.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/numpy_to_tensor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8e5a6030c2b226a254e44eb0c7c537b3a45c0892 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/numpy_to_tensor.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/tensor_to_numpy.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/tensor_to_numpy.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..71dc34c995da6f677f084ba30a8b29575cb2b607 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/tensor_to_numpy.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/add_observations_from_episodes_to_batch.py b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/add_observations_from_episodes_to_batch.py new file mode 100644 index 0000000000000000000000000000000000000000..54fab7b064c55fd00621cb0ebf052690da63a698 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/add_observations_from_episodes_to_batch.py @@ -0,0 +1,180 @@ +from typing import Any, Dict, List, Optional + +import gymnasium as gym + +from ray.rllib.core.columns import Columns +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.utils.annotations import override +from ray.rllib.utils.typing import EpisodeType +from ray.util.annotations import PublicAPI + + +@PublicAPI(stability="alpha") +class AddObservationsFromEpisodesToBatch(ConnectorV2): + """Gets the last observation from a running episode and adds it to the batch. + + Note: This is one of the default env-to-module or Learner ConnectorV2 pieces that + are added automatically by RLlib into every env-to-module/Learner connector + pipeline, unless `config.add_default_connectors_to_env_to_module_pipeline` or + `config.add_default_connectors_to_learner_pipeline ` are set to + False. + + The default env-to-module connector pipeline is: + [ + [0 or more user defined ConnectorV2 pieces], + AddObservationsFromEpisodesToBatch, + AddTimeDimToBatchAndZeroPad, + AddStatesFromEpisodesToBatch, + AgentToModuleMapping, # only in multi-agent setups! 
+ BatchIndividualItems, + NumpyToTensor, + ] + The default Learner connector pipeline is: + [ + [0 or more user defined ConnectorV2 pieces], + AddObservationsFromEpisodesToBatch, + AddColumnsFromEpisodesToTrainBatch, + AddTimeDimToBatchAndZeroPad, + AddStatesFromEpisodesToBatch, + AgentToModuleMapping, # only in multi-agent setups! + BatchIndividualItems, + NumpyToTensor, + ] + + This ConnectorV2: + - Operates on a list of Episode objects (single- or multi-agent). + - Gets the most recent observation(s) from all the given episodes and adds them + to the batch under construction (as a list of individual observations). + - Does NOT alter any observations (or other data) in the given episodes. + - Can be used in EnvToModule and Learner connector pipelines. + + .. testcode:: + + import gymnasium as gym + import numpy as np + + from ray.rllib.connectors.common import AddObservationsFromEpisodesToBatch + from ray.rllib.env.single_agent_episode import SingleAgentEpisode + from ray.rllib.utils.test_utils import check + + # Create two dummy SingleAgentEpisodes, each containing 2 observations, + # 1 action and 1 reward (both are length=1). + obs_space = gym.spaces.Box(-1.0, 1.0, (2,), np.float32) + act_space = gym.spaces.Discrete(2) + + episodes = [SingleAgentEpisode( + observations=[obs_space.sample(), obs_space.sample()], + actions=[act_space.sample()], + rewards=[1.0], + len_lookback_buffer=0, + ) for _ in range(2)] + eps_1_last_obs = episodes[0].get_observations(-1) + eps_2_last_obs = episodes[1].get_observations(-1) + print(f"1st Episode's last obs is {eps_1_last_obs}") + print(f"2nd Episode's last obs is {eps_2_last_obs}") + + # Create an instance of this class. + connector = AddObservationsFromEpisodesToBatch() + + # Call the connector with the two created episodes. + # Note that this particular connector works without an RLModule, so we + # simplify here for the sake of this example. 
+ output_batch = connector( + rl_module=None, + batch={}, + episodes=episodes, + explore=True, + shared_data={}, + ) + # The output data should now contain the last observations of both episodes, + # in a "per-episode organized" fashion. + check( + output_batch, + { + "obs": { + (episodes[0].id_,): [eps_1_last_obs], + (episodes[1].id_,): [eps_2_last_obs], + }, + }, + ) + """ + + def __init__( + self, + input_observation_space: Optional[gym.Space] = None, + input_action_space: Optional[gym.Space] = None, + *, + as_learner_connector: bool = False, + **kwargs, + ): + """Initializes a AddObservationsFromEpisodesToBatch instance. + + Args: + as_learner_connector: Whether this connector is part of a Learner connector + pipeline, as opposed to a env-to-module pipeline. As a Learner + connector, it will add an entire Episode's observations (each timestep) + to the batch. + """ + super().__init__( + input_observation_space=input_observation_space, + input_action_space=input_action_space, + **kwargs, + ) + + self._as_learner_connector = as_learner_connector + + @override(ConnectorV2) + def __call__( + self, + *, + rl_module: RLModule, + batch: Dict[str, Any], + episodes: List[EpisodeType], + explore: Optional[bool] = None, + shared_data: Optional[dict] = None, + **kwargs, + ) -> Any: + # If "obs" already in data, early out. + if Columns.OBS in batch: + return batch + for i, sa_episode in enumerate( + self.single_agent_episode_iterator( + episodes, + # If Learner connector, get all episodes (for train batch). + # If EnvToModule, get only those ongoing episodes that just had their + # agent step (b/c those are the ones we need to compute actions for + # next). + agents_that_stepped_only=not self._as_learner_connector, + ) + ): + if self._as_learner_connector: + # TODO (sven): Resolve this hack by adding a new connector piece that + # performs this very task. 
+ if "_" not in sa_episode.id_: + sa_episode.id_ += "_" + str(i) + + self.add_n_batch_items( + batch, + Columns.OBS, + # Add all observations, except the very last one. + # For a terminated episode, this is the terminal observation that + # has no value for training. + # For a truncated episode, algorithms either add an extra NEXT_OBS + # column to the batch (ex. DQN) or extend the episode length by one + # (using a separate connector piece and this truncated last obs), + # then bootstrap the value estimation for that extra timestep. + items_to_add=sa_episode.get_observations(slice(0, len(sa_episode))), + num_items=len(sa_episode), + single_agent_episode=sa_episode, + ) + else: + assert not sa_episode.is_numpy + self.add_batch_item( + batch, + Columns.OBS, + item_to_add=sa_episode.get_observations(-1), + single_agent_episode=sa_episode, + ) + + return batch diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/add_states_from_episodes_to_batch.py b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/add_states_from_episodes_to_batch.py new file mode 100644 index 0000000000000000000000000000000000000000..9e211dd255728c0668832eb3fdd65ea94cd4e98a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/add_states_from_episodes_to_batch.py @@ -0,0 +1,348 @@ +import math +from typing import Any, Dict, List, Optional + +import gymnasium as gym +import numpy as np +import tree # pip install dm_tree + +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.core import DEFAULT_MODULE_ID +from ray.rllib.core.columns import Columns +from ray.rllib.core.rl_module.multi_rl_module import MultiRLModule +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.utils.annotations import override +from ray.rllib.utils.numpy import convert_to_numpy +from ray.rllib.utils.typing import EpisodeType +from ray.util.annotations import PublicAPI + + +@PublicAPI(stability="alpha") +class 
class AddStatesFromEpisodesToBatch(ConnectorV2):
    """Gets last STATE_OUT from running episode and adds it as STATE_IN to the batch.

    Note: This is one of the default env-to-module or Learner ConnectorV2 pieces that
    are added automatically by RLlib into every env-to-module/Learner connector
    pipeline, unless `config.add_default_connectors_to_env_to_module_pipeline` or
    `config.add_default_connectors_to_learner_pipeline` are set to
    False.

    The default env-to-module connector pipeline is:
    [
        [0 or more user defined ConnectorV2 pieces],
        AddObservationsFromEpisodesToBatch,
        AddTimeDimToBatchAndZeroPad,
        AddStatesFromEpisodesToBatch,
        AgentToModuleMapping,  # only in multi-agent setups!
        BatchIndividualItems,
        NumpyToTensor,
    ]
    The default Learner connector pipeline is:
    [
        [0 or more user defined ConnectorV2 pieces],
        AddObservationsFromEpisodesToBatch,
        AddColumnsFromEpisodesToTrainBatch,
        AddTimeDimToBatchAndZeroPad,
        AddStatesFromEpisodesToBatch,
        AgentToModuleMapping,  # only in multi-agent setups!
        BatchIndividualItems,
        NumpyToTensor,
    ]

    If the RLModule is stateful, the episodes' STATE_OUTS will be extracted
    and restructured under a new STATE_IN key.
    As a Learner connector, the resulting STATE_IN batch has the shape (B', ...).
    Here, B' is the sum of splits we have to do over the given episodes, such that each
    chunk is at most `max_seq_len` long (T-axis).
    As an EnvToModule connector, the resulting STATE_IN batch simply consists of n
    states coming from n vectorized environments/episodes.

    Also, all other data (observations, rewards, etc.. if applicable) will be properly
    reshaped into (B, T=max_seq_len (learner) or 1 (env-to-module), ...) and will be
    zero-padded, if necessary.

    This ConnectorV2:
    - Operates on a list of Episode objects.
    - Gets the most recent STATE_OUT from all the given episodes and adds them under
      the STATE_IN key to the batch under construction.
    - Does NOT alter any data in the given episodes.
    - Can be used in EnvToModule and Learner connector pipelines.

    .. testcode::

        from ray.rllib.connectors.common import AddStatesFromEpisodesToBatch
        from ray.rllib.core.columns import Columns
        from ray.rllib.env.single_agent_episode import SingleAgentEpisode
        from ray.rllib.utils.test_utils import check

        # Create a simple dummy class, pretending to be an RLModule with
        # `get_initial_state`, `is_stateful` and `model_config` property defined:
        class MyStateModule:
            # dummy config
            model_config = {"max_seq_len": 2}

            def is_stateful(self):
                return True

            def get_initial_state(self):
                return 0.0


        # Create an empty episode. The connector should use the RLModule's initial
        # state to populate STATE_IN for the next forward pass.
        episode = SingleAgentEpisode()

        rl_module = MyStateModule()
        rl_module_init_state = rl_module.get_initial_state()

        # Create an instance of this class (as an env-to-module connector).
        connector = AddStatesFromEpisodesToBatch(as_learner_connector=False)

        # Call the connector.
        output_batch = connector(
            rl_module=rl_module,
            batch={},
            episodes=[episode],
            shared_data={},
        )
        # The output data's STATE_IN key should now contain the RLModule's initial
        # state plus the one state out found in the episode in a "per-episode
        # organized" fashion.
        check(
            output_batch[Columns.STATE_IN],
            {
                (episode.id_,): [rl_module_init_state],
            },
        )

        # Create a SingleAgentEpisode containing 5 observations,
        # 4 actions, 4 rewards, and 4 STATE_OUTs.
        # The same connector should now use the episode-stored last STATE_OUT as
        # STATE_IN for the next forward pass.
        episode = SingleAgentEpisode(
            observations=[0, 1, 2, 3, 4],
            actions=[1, 2, 3, 4],
            rewards=[1.0, 2.0, 3.0, 4.0],
            # STATE_OUT in episode will show up under STATE_IN in the batch.
            extra_model_outputs={
                Columns.STATE_OUT: [-4.0, -3.0, -2.0, -1.0],
            },
            len_lookback_buffer=0,
        )

        # Call the connector.
        output_batch = connector(
            rl_module=rl_module,
            batch={},
            episodes=[episode],
            shared_data={},
        )
        # The output data's STATE_IN key should now contain the episode's last
        # STATE_OUT, NOT the RLModule's initial state in a "per-episode organized"
        # fashion.
        check(
            output_batch[Columns.STATE_IN],
            {
                # Expect the episode's last STATE_OUT.
                (episode.id_,): [-1.0],
            },
        )

        # Create a new connector as a learner connector with a RNN seq len of 2 (for
        # testing purposes only). Passing the same data through this learner
        # connector, we expect the STATE_IN data to contain a) the initial module
        # state and then every 2nd STATE_OUT stored in the episode.
        connector = AddStatesFromEpisodesToBatch(as_learner_connector=True)

        # Call the connector.
        output_batch = connector(
            rl_module=rl_module,
            batch={},
            episodes=[episode],
            shared_data={},
        )
        check(
            output_batch[Columns.STATE_IN],
            {
                # Expect initial module state + every 2nd STATE_OUT from episode, but
                # not the very last one (just like the very last observation, this
                # data is NOT passed through the forward_train, b/c there is nothing
                # to learn at that timestep, unless we need to compute e.g. bootstrap
                # value predictions).
                # Also note that the different STATE_IN timesteps are already present
                # as one batched item per episode in the list.
                (episode.id_,): [rl_module_init_state, -3.0],
            },
        )
    """

    def __init__(
        self,
        input_observation_space: Optional[gym.Space] = None,
        input_action_space: Optional[gym.Space] = None,
        *,
        as_learner_connector: bool = False,
        **kwargs,
    ):
        """Initializes an AddStatesFromEpisodesToBatch instance.

        Args:
            as_learner_connector: Whether this connector is part of a Learner
                connector pipeline, as opposed to an env-to-module pipeline. As a
                Learner connector, it will add one STATE_IN entry per max_seq_len
                chunk of each episode to the batch (instead of only the most recent
                state per ongoing episode).
        """
        super().__init__(
            input_observation_space=input_observation_space,
            input_action_space=input_action_space,
            **kwargs,
        )

        # Remember which pipeline we are in; this switches `__call__` between the
        # per-chunk (learner) and most-recent-state (env-to-module) code paths.
        self._as_learner_connector = as_learner_connector

    @override(ConnectorV2)
    def __call__(
        self,
        *,
        rl_module: RLModule,
        batch: Dict[str, Any],
        episodes: List[EpisodeType],
        explore: Optional[bool] = None,
        shared_data: Optional[dict] = None,
        **kwargs,
    ) -> Any:
        # If not stateful OR STATE_IN already in data, early out.
        if not rl_module.is_stateful() or Columns.STATE_IN in batch:
            return batch

        for sa_episode in self.single_agent_episode_iterator(
            episodes,
            # If Learner connector, get all episodes (for train batch).
            # If EnvToModule, get only those ongoing episodes that just had their
            # agent step (b/c those are the ones we need to compute actions for next).
            agents_that_stepped_only=not self._as_learner_connector,
        ):
            if self._as_learner_connector:
                # Multi-agent case: Extract correct single agent RLModule (to get its
                # individual state).
                if sa_episode.module_id is not None:
                    sa_module = rl_module[sa_episode.module_id]
                else:
                    sa_module = (
                        rl_module[DEFAULT_MODULE_ID]
                        if isinstance(rl_module, MultiRLModule)
                        else rl_module
                    )
                # This single-agent RLModule is NOT stateful -> Skip.
                if not sa_module.is_stateful():
                    continue

                max_seq_len = sa_module.model_config["max_seq_len"]

                # look_back_state.shape=([state-dim],)
                look_back_state = (
                    # Episode has a (reset) beginning -> Prepend initial
                    # state.
                    convert_to_numpy(sa_module.get_initial_state())
                    if sa_episode.t_started == 0
                    or (Columns.STATE_OUT not in sa_episode.extra_model_outputs)
                    # Episode starts somewhere in the middle (is a cut
                    # continuation chunk) -> Use previous chunk's last
                    # STATE_OUT as initial state.
                    else sa_episode.get_extra_model_outputs(
                        key=Columns.STATE_OUT,
                        indices=-1,
                        neg_index_as_lookback=True,
                    )
                )
                # If we have `"state_out"`s (e.g. from rollouts) use them for the
                # `"state_in"`s.
                if Columns.STATE_OUT in sa_episode.extra_model_outputs:
                    # state_outs.shape=(T,[state-dim])  T=episode len
                    state_outs = sa_episode.get_extra_model_outputs(
                        key=Columns.STATE_OUT
                    )
                # Otherwise, we have no `"state_out"` (e.g. because we are sampling
                # from offline data and the expert policy was not stateful).
                else:
                    # Then simply use the `look_back_state`, i.e. in this case the
                    # initial state as `"state_in"` in training.
                    if sa_episode.is_numpy:
                        # Tile the single state along a new T axis, once per episode
                        # timestep.
                        state_outs = tree.map_structure(
                            lambda a, _sae=sa_episode: np.repeat(
                                a[np.newaxis, ...], len(_sae), axis=0
                            ),
                            look_back_state,
                        )
                    else:
                        state_outs = [look_back_state for _ in range(len(sa_episode))]
                # Explanation:
                # B=episode len // max_seq_len
                # [::max_seq_len]: only keep every Tth state.
                # [:-1]: Shift state outs by one; ignore very last
                # STATE_OUT, but therefore add the lookback/init state at
                # the beginning.
                items_to_add = (
                    tree.map_structure(
                        lambda i, o, m=max_seq_len: np.concatenate([[i], o[:-1]])[::m],
                        look_back_state,
                        state_outs,
                    )
                    if sa_episode.is_numpy
                    else ([look_back_state] + state_outs[:-1])[::max_seq_len]
                )
                self.add_n_batch_items(
                    batch=batch,
                    column=Columns.STATE_IN,
                    items_to_add=items_to_add,
                    # One STATE_IN per (possibly zero-padded) max_seq_len chunk.
                    num_items=int(math.ceil(len(sa_episode) / max_seq_len)),
                    single_agent_episode=sa_episode,
                )
                # If the batch also carries NEXT_OBS, provide matching
                # NEXT_STATE_IN entries (states aligned with the chunk starts,
                # shifted by one step, e.g. for bootstrap value computations).
                if Columns.NEXT_OBS in batch:
                    items_to_add = (
                        tree.map_structure(
                            lambda i, m=max_seq_len: i[::m],
                            state_outs,
                        )
                        if sa_episode.is_numpy
                        else state_outs[::max_seq_len]
                    )
                    self.add_n_batch_items(
                        batch=batch,
                        column=Columns.NEXT_STATE_IN,
                        items_to_add=items_to_add,
                        num_items=int(math.ceil(len(sa_episode) / max_seq_len)),
                        single_agent_episode=sa_episode,
                    )

            else:
                # Env-to-module: episodes are still "live" (not numpy-converted).
                assert not sa_episode.is_numpy

                # Multi-agent case: Extract correct single agent RLModule (to get the
                # state for individually).
                sa_module = rl_module
                if sa_episode.module_id is not None:
                    sa_module = rl_module[sa_episode.module_id]
                # This single-agent RLModule is NOT stateful -> Skip.
                if not sa_module.is_stateful():
                    continue

                # Episode just started or has no `"state_out"` (e.g. in offline
                # sampling) -> Get initial state from our RLModule.
                if (sa_episode.t_started == 0 and len(sa_episode) == 0) or (
                    Columns.STATE_OUT not in sa_episode.extra_model_outputs
                ):
                    state = sa_module.get_initial_state()
                # Episode is already ongoing -> Use most recent STATE_OUT.
                else:
                    state = sa_episode.get_extra_model_outputs(
                        key=Columns.STATE_OUT,
                        indices=-1,
                    )
                self.add_batch_item(
                    batch,
                    Columns.STATE_IN,
                    item_to_add=state,
                    single_agent_episode=sa_episode,
                )

        return batch
@PublicAPI(stability="alpha")
class AddTimeDimToBatchAndZeroPad(ConnectorV2):
    """Adds an extra time dim (axis=1) to all data currently in the batch.

    Note: This is one of the default env-to-module or Learner ConnectorV2 pieces that
    are added automatically by RLlib into every env-to-module/Learner connector
    pipeline, unless `config.add_default_connectors_to_env_to_module_pipeline` or
    `config.add_default_connectors_to_learner_pipeline` are set to
    False.

    The default env-to-module connector pipeline is:
    [
        [0 or more user defined ConnectorV2 pieces],
        AddObservationsFromEpisodesToBatch,
        AddTimeDimToBatchAndZeroPad,
        AddStatesFromEpisodesToBatch,
        AgentToModuleMapping,  # only in multi-agent setups!
        BatchIndividualItems,
        NumpyToTensor,
    ]
    The default Learner connector pipeline is:
    [
        [0 or more user defined ConnectorV2 pieces],
        AddObservationsFromEpisodesToBatch,
        AddColumnsFromEpisodesToTrainBatch,
        AddTimeDimToBatchAndZeroPad,
        AddStatesFromEpisodesToBatch,
        AgentToModuleMapping,  # only in multi-agent setups!
        BatchIndividualItems,
        NumpyToTensor,
    ]

    If the RLModule is stateful, an extra time dim at axis=1 is added to all data in
    the batch.

    Also, all data (observations, rewards, etc.. if applicable) will be properly
    reshaped into (B, T=max_seq_len (learner) or 1 (env-to-module), ...) and will be
    zero-padded, if necessary.

    This ConnectorV2:
    - Operates on a list of Episode objects.
    - Adds a time dim at axis=1 to all columns already in the batch.
    - In case of a learner connector pipeline, zero-pads the data according to the
      module's `self.model_config["max_seq_len"]` setting and reshapes all data to
      (B, T, ...). The connector also adds SEQ_LENS information and loss mask
      information to the batch based on the added zero-padding.
    - Does NOT alter any data in the given episodes.
    - Can be used in EnvToModule and Learner connector pipelines.

    .. testcode::

        from ray.rllib.connectors.common import AddTimeDimToBatchAndZeroPad
        from ray.rllib.core.columns import Columns
        from ray.rllib.env.single_agent_episode import SingleAgentEpisode
        from ray.rllib.utils.test_utils import check


        # Create a simple dummy class, pretending to be an RLModule with
        # `get_initial_state`, `is_stateful` and `model_config` property defined:
        class MyStateModule:
            # dummy config
            model_config = {"max_seq_len": 3}

            def is_stateful(self):
                return True

            def get_initial_state(self):
                return 0.0


        # Create an already reset episode. Expect the connector to add a time-dim to
        # the reset observation.
        episode = SingleAgentEpisode(observations=[0])

        rl_module = MyStateModule()

        # Create an instance of this class (as an env-to-module connector).
        connector = AddTimeDimToBatchAndZeroPad(as_learner_connector=False)

        # Call the connector.
        output_batch = connector(
            rl_module=rl_module,
            batch={Columns.OBS: [0]},
            episodes=[episode],
            shared_data={},
        )
        # The output data's OBS key should now be reshaped to (B, T)
        check(output_batch[Columns.OBS], [[0]])

        # Create a SingleAgentEpisode containing 5 observations,
        # 4 actions and 4 rewards.
        episode = SingleAgentEpisode(
            observations=[0, 1, 2, 3, 4],
            actions=[1, 2, 3, 4],
            rewards=[1.0, 2.0, 3.0, 4.0],
            len_lookback_buffer=0,
        )

        # Call the connector.
        output_batch = connector(
            rl_module=rl_module,
            batch={Columns.OBS: [4]},
            episodes=[episode],
            shared_data={},
        )
        # The output data's OBS key should now have a time rank.
        check(
            # Expect the episode's last OBS.
            output_batch[Columns.OBS], [[4]],
        )

        # Create a new connector as a learner connector with a RNN seq len of 3 (for
        # testing purposes only). Passing the same data through this learner
        # connector, we expect the data to also be zero-padded.
        connector = AddTimeDimToBatchAndZeroPad(as_learner_connector=True)

        # Call the connector.
        output_batch = connector(
            rl_module=rl_module,
            batch={Columns.OBS: {(episode.id_,): [0, 1, 2, 3]}},
            episodes=[episode],
            shared_data={},
        )
        check(output_batch[Columns.OBS], {(episode.id_,): [[0, 1, 2], [3, 0, 0]]})
    """

    def __init__(
        self,
        input_observation_space: Optional[gym.Space] = None,
        input_action_space: Optional[gym.Space] = None,
        *,
        as_learner_connector: bool = False,
        **kwargs,
    ):
        """Initializes an AddTimeDimToBatchAndZeroPad instance.

        Args:
            as_learner_connector: Whether this connector is part of a Learner
                connector pipeline, as opposed to an env-to-module pipeline. As a
                Learner connector, it will zero-pad and reshape all batch data into
                (B, T=max_seq_len, ...) chunks (instead of adding a single-timestep
                time rank).
        """
        super().__init__(
            input_observation_space=input_observation_space,
            input_action_space=input_action_space,
            **kwargs,
        )

        # Remember which pipeline we are in; this switches `__call__` between the
        # zero-padding (learner) and single-timestep (env-to-module) code paths.
        self._as_learner_connector = as_learner_connector

    @override(ConnectorV2)
    def __call__(
        self,
        *,
        rl_module: RLModule,
        batch: Dict[str, Any],
        episodes: List[EpisodeType],
        explore: Optional[bool] = None,
        shared_data: Optional[dict] = None,
        **kwargs,
    ) -> Any:

        # If not stateful OR STATE_IN already in data, early out.
        if not rl_module.is_stateful() or Columns.STATE_IN in batch:
            return batch

        # Make all inputs (other than STATE_IN) have an additional T-axis.
        # Since data has not been batched yet (we are still operating on lists in the
        # batch), we add this time axis as 0 (not 1). When we batch, the batch axis
        # will be 0 and the time axis will be 1.
        # Also, let module-to-env pipeline know that we had added a single timestep
        # time rank to the data (to remove it again).
        if not self._as_learner_connector:
            for column in batch.keys():
                self.foreach_batch_item_change_in_place(
                    batch=batch,
                    column=column,
                    func=lambda item, eps_id, aid, mid: (
                        item
                        if mid is not None and not rl_module[mid].is_stateful()
                        # Expand on axis 0 (the to-be-time-dim) if item has not been
                        # batched yet, otherwise axis=1 (the time-dim).
                        else tree.map_structure(
                            lambda s: np.expand_dims(
                                s, axis=(1 if isinstance(s, BatchedNdArray) else 0)
                            ),
                            item,
                        )
                    ),
                )
            shared_data["_added_single_ts_time_rank"] = True
        else:
            # Before adding STATE_IN to the `data`, zero-pad existing data and batch
            # into max_seq_len chunks.
            for column, column_data in batch.copy().items():
                # Do not zero-pad INFOS column.
                if column == Columns.INFOS:
                    continue
                for key, item_list in column_data.items():
                    # Multi-agent case AND RLModule is not stateful -> Do not
                    # zero-pad for this model.
                    assert isinstance(key, tuple)
                    mid = None
                    if len(key) == 3:
                        eps_id, aid, mid = key
                        if not rl_module[mid].is_stateful():
                            continue
                    column_data[key] = split_and_zero_pad(
                        item_list,
                        max_seq_len=self._get_max_seq_len(rl_module, module_id=mid),
                    )
                    # TODO (sven): Remove this hint/hack once we are not relying on
                    #  SampleBatch anymore (which has to set its property
                    #  zero_padded=True when shuffling).
                    shared_data[
                        (
                            "_zero_padded_for_mid="
                            f"{mid if mid is not None else DEFAULT_MODULE_ID}"
                        )
                    ] = True

            for sa_episode in self.single_agent_episode_iterator(
                # If Learner connector, get all episodes (for train batch).
                # If EnvToModule, get only those ongoing episodes that just had their
                # agent step (b/c those are the ones we need to compute actions for
                # next).
                episodes,
                agents_that_stepped_only=False,
            ):
                # Multi-agent case: Extract correct single agent RLModule (to get its
                # individual state).
                if sa_episode.module_id is not None:
                    sa_module = rl_module[sa_episode.module_id]
                else:
                    sa_module = (
                        rl_module[DEFAULT_MODULE_ID]
                        if isinstance(rl_module, MultiRLModule)
                        else rl_module
                    )
                # This single-agent RLModule is NOT stateful -> Skip.
                if not sa_module.is_stateful():
                    continue

                max_seq_len = sa_module.model_config["max_seq_len"]

                # Also, create the loss mask (b/c of our now possibly zero-padded
                # data) as well as the seq_lens array and add these to `data` as
                # well.
                mask, seq_lens = create_mask_and_seq_lens(len(sa_episode), max_seq_len)
                self.add_n_batch_items(
                    batch=batch,
                    column=Columns.SEQ_LENS,
                    items_to_add=seq_lens,
                    num_items=len(seq_lens),
                    single_agent_episode=sa_episode,
                )
                if not shared_data.get("_added_loss_mask_for_valid_episode_ts"):
                    self.add_n_batch_items(
                        batch=batch,
                        column=Columns.LOSS_MASK,
                        items_to_add=mask,
                        num_items=len(mask),
                        single_agent_episode=sa_episode,
                    )

        return batch

    def _get_max_seq_len(self, rl_module, module_id=None):
        # Resolve the (single-agent) module whose `model_config` holds the
        # max_seq_len setting; error out with a helpful message if it is missing.
        if not isinstance(rl_module, MultiRLModule):
            mod = rl_module
        elif module_id:
            mod = rl_module[module_id]
        else:
            mod = next(iter(rl_module.values()))
        if "max_seq_len" not in mod.model_config:
            raise ValueError(
                "You are using a stateful RLModule and are not providing a "
                "'max_seq_len' key inside your `model_config`. You can set this "
                "dict and/or override keys in it via `config.rl_module("
                "model_config={'max_seq_len': [some int]})`."
            )
        return mod.model_config["max_seq_len"]
@PublicAPI(stability="alpha")
class AgentToModuleMapping(ConnectorV2):
    """ConnectorV2 that performs mapping of data from AgentID based to ModuleID based.

    Note: This is one of the default env-to-module or Learner ConnectorV2 pieces that
    are added automatically by RLlib into every env-to-module/Learner connector
    pipeline, unless `config.add_default_connectors_to_env_to_module_pipeline` or
    `config.add_default_connectors_to_learner_pipeline` are set to
    False.

    The default env-to-module connector pipeline is:
    [
        [0 or more user defined ConnectorV2 pieces],
        AddObservationsFromEpisodesToBatch,
        AddTimeDimToBatchAndZeroPad,
        AddStatesFromEpisodesToBatch,
        AgentToModuleMapping,  # only in multi-agent setups!
        BatchIndividualItems,
        NumpyToTensor,
    ]
    The default Learner connector pipeline is:
    [
        [0 or more user defined ConnectorV2 pieces],
        AddObservationsFromEpisodesToBatch,
        AddColumnsFromEpisodesToTrainBatch,
        AddTimeDimToBatchAndZeroPad,
        AddStatesFromEpisodesToBatch,
        AgentToModuleMapping,  # only in multi-agent setups!
        BatchIndividualItems,
        NumpyToTensor,
    ]

    This connector piece is only used by RLlib (as a default connector piece) in a
    multi-agent setup.

    Note that before the mapping, `data` is expected to have the following
    structure:
    [col0]:
        (eps_id0, ag0, mod0): [list of individual batch items]
        (eps_id0, ag1, mod2): [list of individual batch items]
        (eps_id1, ag0, mod1): [list of individual batch items]
    [col1]:
        etc..

    The target structure of the above `data` would then be:
    [mod0]:
        [col0]: [batched data -> batch_size_B will be the number of all items in the
        input data under col0 that have mod0 as their ModuleID]
        [col1]: [batched data]
    [mod1]:
        [col0]: etc.

    Mapping happens in the following stages:

    1) Under each column name, sort keys first by EpisodeID, then AgentID.
    2) Add ModuleID keys under each column name (no cost/extra memory) and map these
       new keys to empty lists.
       [col0] -> [mod0] -> []: Then push items that belong to mod0 into these lists.
    3) Perform batching on the per-module lists under each column:
       [col0] -> [mod0]: [...] <- now batched data (numpy array or struct of numpy
       arrays).
    4) Flip column names with ModuleIDs (no cost/extra memory):
       [mod0]:
           [col0]: [batched data]
       etc..

    Note that in order to unmap the resulting batch back into an AgentID based one,
    we have to store the env vector index AND AgentID of each module's batch item
    in an additionally returned `memorized_map_structure`.

    .. testcode::

        from ray.rllib.connectors.env_to_module import AgentToModuleMapping
        from ray.rllib.utils.test_utils import check

        batch = {
            "obs": {
                ("MA-EPS0", "agent0", "module0"): [0, 1, 2],
                ("MA-EPS0", "agent1", "module1"): [3, 4, 5],
            },
            "actions": {
                ("MA-EPS1", "agent2", "module0"): [8],
                ("MA-EPS0", "agent1", "module1"): [9],
            },
        }

        # Create our connector piece.
        connector = AgentToModuleMapping(
            rl_module_specs={"module0", "module1"},
            agent_to_module_mapping_fn=(
                lambda agent_id, eps: "module1" if agent_id == "agent1" else "module0"
            ),
        )

        # Call the connector (and thereby flip from AgentID based to ModuleID based
        # structure).
        output_batch = connector(
            rl_module=None,  # This particular connector works without an RLModule.
            batch=batch,
            episodes=[],  # This particular connector works without a list of
            # episodes.
            explore=True,
            shared_data={},
        )

        # `data` should now be mapped from ModuleIDs to module data.
        check(
            output_batch,
            {
                "module0": {
                    "obs": [0, 1, 2],
                    "actions": [8],
                },
                "module1": {
                    "obs": [3, 4, 5],
                    "actions": [9],
                },
            },
        )
    """

    @override(ConnectorV2)
    def recompute_output_observation_space(
        self,
        input_observation_space: gym.Space,
        input_action_space: gym.Space,
    ) -> gym.Space:
        # Map the per-agent observation space dict to a per-module space dict.
        return self._map_space_if_necessary(input_observation_space, "obs")

    @override(ConnectorV2)
    def recompute_output_action_space(
        self,
        input_observation_space: gym.Space,
        input_action_space: gym.Space,
    ) -> gym.Space:
        # Map the per-agent action space dict to a per-module space dict.
        return self._map_space_if_necessary(input_action_space, "act")

    def __init__(
        self,
        input_observation_space: Optional[gym.Space] = None,
        input_action_space: Optional[gym.Space] = None,
        *,
        rl_module_specs: Dict[ModuleID, RLModuleSpec],
        agent_to_module_mapping_fn,
    ):
        """Initializes an AgentToModuleMapping instance.

        Args:
            rl_module_specs: The ModuleID -> RLModuleSpec dict (or set of ModuleIDs)
                of all modules in the MultiRLModule.
            agent_to_module_mapping_fn: Callable mapping (AgentID, Episode) to the
                ModuleID that should handle that agent's data.
        """
        super().__init__(input_observation_space, input_action_space)

        self._rl_module_specs = rl_module_specs
        self._agent_to_module_mapping_fn = agent_to_module_mapping_fn

    @override(ConnectorV2)
    def __call__(
        self,
        *,
        rl_module: RLModule,
        batch: Dict[str, Any],
        episodes: List[EpisodeType],
        explore: Optional[bool] = None,
        shared_data: Optional[dict] = None,
        **kwargs,
    ) -> Any:
        # Current agent to module mapping function.
        # agent_to_module_mapping_fn = shared_data.get("agent_to_module_mapping_fn")
        # Store in shared data, which module IDs map to which episode/agent, such
        # that the module-to-env pipeline can map the data back to agents.
        memorized_map_structure = defaultdict(list)
        for column, agent_data in batch.items():
            # Skip columns that are already keyed by ModuleID.
            if rl_module is not None and column in rl_module:
                continue
            for eps_id, agent_id, module_id in agent_data.keys():
                memorized_map_structure[module_id].append((eps_id, agent_id))
            # TODO (sven): We should check that all columns have the same struct.
            # Only the first (non-module) column is needed to memorize the structure.
            break

        shared_data["memorized_map_structure"] = dict(memorized_map_structure)

        # Mapping from ModuleID to column data.
        data_by_module = {}

        # Iterating over each column in the original data:
        for column, agent_data in batch.items():
            # Column is already a ModuleID -> Merge its data as-is.
            if rl_module is not None and column in rl_module:
                if column in data_by_module:
                    data_by_module[column].update(agent_data)
                else:
                    data_by_module[column] = agent_data
                continue
            for (
                eps_id,
                agent_id,
                module_id,
            ), values_batch_or_list in agent_data.items():
                assert isinstance(values_batch_or_list, list)
                for value in values_batch_or_list:
                    if module_id not in data_by_module:
                        data_by_module[module_id] = {column: []}
                    elif column not in data_by_module[module_id]:
                        data_by_module[module_id][column] = []

                    # Append the data.
                    data_by_module[module_id][column].append(value)

        return data_by_module

    def _map_space_if_necessary(self, space: gym.Space, which: str = "obs"):
        # Analyze input observation space to check, whether the user has already
        # taken care of the agent to module mapping.
        if set(self._rl_module_specs) == set(space.spaces.keys()):
            return space

        # We need to take care of agent to module mapping. Figure out the resulting
        # observation space here.
        dummy_eps = MultiAgentEpisode()

        ret_space = {}
        for module_id in self._rl_module_specs:
            # Easy way out, user has provided space in the RLModule spec dict.
            if (
                isinstance(self._rl_module_specs, dict)
                and module_id in self._rl_module_specs
            ):
                if (
                    which == "obs"
                    and self._rl_module_specs[module_id].observation_space
                ):
                    ret_space[module_id] = self._rl_module_specs[
                        module_id
                    ].observation_space
                    continue
                elif which == "act" and self._rl_module_specs[module_id].action_space:
                    ret_space[module_id] = self._rl_module_specs[module_id].action_space
                    continue

            # Need to reverse map spaces (for the different agents) to certain
            # module IDs (using a dummy MultiAgentEpisode).
            one_space = next(iter(space.spaces.values()))
            # If all obs spaces are the same anyway, just use the first
            # single-agent space.
            if all(s == one_space for s in space.spaces.values()):
                ret_space[module_id] = one_space
            # Otherwise, we have to compare the ModuleID with all possible
            # AgentIDs and find the agent ID that matches.
            else:
                match_aid = None
                one_agent_for_module_found = False
                for aid in space.spaces.keys():
                    # Match: Assign spaces for this agentID to the PolicyID.
                    if self._agent_to_module_mapping_fn(aid, dummy_eps) == module_id:
                        # Make sure, different agents that map to the same
                        # policy don't have different spaces.
                        if (
                            module_id in ret_space
                            and space[aid] != ret_space[module_id]
                        ):
                            raise ValueError(
                                f"Two agents ({aid} and {match_aid}) in your "
                                "environment map to the same ModuleID (as per your "
                                "`agent_to_module_mapping_fn`), however, these agents "
                                "also have different observation spaces as per the env!"
                            )
                        ret_space[module_id] = space[aid]
                        match_aid = aid
                        one_agent_for_module_found = True
                # Still no space found for this module ID -> Error out.
                if not one_agent_for_module_found:
                    raise ValueError(
                        f"Could not find or derive any {which}-space for RLModule "
                        f"{module_id}! This can happen if your `config.rl_module(rl_"
                        f"module_spec=...)` does NOT contain space information for this"
                        " particular single-agent module AND your agent-to-module-"
                        "mapping function is stochastic (such that for some agent A, "
                        "more than one ModuleID might be returned somewhat randomly). "
                        f"Fix this error by providing {which}-space information using "
                        "`config.rl_module(rl_module_spec=MultiRLModuleSpec("
                        f"rl_module_specs={{'{module_id}': RLModuleSpec("
                        "observation_space=..., action_space=...)}}))"
                    )

        return gym.spaces.Dict(ret_space)
@PublicAPI(stability="alpha")
class BatchIndividualItems(ConnectorV2):
    """Batches individual data-items (in lists) into tensors (with batch dimension).

    Note: This is one of the default env-to-module or Learner ConnectorV2 pieces that
    are added automatically by RLlib into every env-to-module/Learner connector
    pipeline, unless `config.add_default_connectors_to_env_to_module_pipeline` or
    `config.add_default_connectors_to_learner_pipeline` are set to
    False.

    The default env-to-module connector pipeline is:
    [
        [0 or more user defined ConnectorV2 pieces],
        AddObservationsFromEpisodesToBatch,
        AddTimeDimToBatchAndZeroPad,
        AddStatesFromEpisodesToBatch,
        AgentToModuleMapping,  # only in multi-agent setups!
        BatchIndividualItems,
        NumpyToTensor,
    ]
    The default Learner connector pipeline is:
    [
        [0 or more user defined ConnectorV2 pieces],
        AddObservationsFromEpisodesToBatch,
        AddColumnsFromEpisodesToTrainBatch,
        AddTimeDimToBatchAndZeroPad,
        AddStatesFromEpisodesToBatch,
        AgentToModuleMapping,  # only in multi-agent setups!
        BatchIndividualItems,
        NumpyToTensor,
    ]

    This ConnectorV2:
    - Operates only on the input `data`, NOT the incoming list of episode objects
      (ignored).
    - In the single-agent case, `data` must already be a dict, structured as follows
      by prior connector pieces of the same pipeline:
      [col0] -> {[(eps_id,)]: [list of individual batch items]}
    - In the multi-agent case, `data` must already be a dict, structured as follows
      by prior connector pieces of the same pipeline (in particular the
      `AgentToModuleMapping` piece):
      [module_id] -> [col0] -> [list of individual batch items]
    - Translates the above data under the different columns (e.g. "obs") into final
      (batched) structures. For the single-agent case, the output `data` looks like
      this: [col0] -> [possibly complex struct of batches (at the leafs)].
      For the multi-agent case, the output `data` looks like this:
      [module_id] -> [col0] -> [possibly complex struct of batches (at the leafs)].

    .. testcode::

        from ray.rllib.connectors.common import BatchIndividualItems
        from ray.rllib.utils.test_utils import check

        single_agent_batch = {
            "obs": {
                # Note that at this stage, next-obs is not part of the data anymore..
                ("MA-EPS0",): [0, 1],
                ("MA-EPS1",): [2, 3],
            },
            "actions": {
                # .. so we have as many actions per episode as we have observations.
                ("MA-EPS0",): [4, 5],
                ("MA-EPS1",): [6, 7],
            },
        }

        # Create our (single-agent) connector piece.
        connector = BatchIndividualItems()

        # Call the connector (and thereby batch the individual items).
        output_batch = connector(
            rl_module=None,  # This particular connector works without an RLModule.
            batch=single_agent_batch,
            episodes=[],  # This particular connector works without a list of
            # episodes.
            explore=True,
            shared_data={},
        )

        # `output_batch` should now be batched (episode IDs should have been removed
        # from the struct).
        check(
            output_batch,
            {"obs": [0, 1, 2, 3], "actions": [4, 5, 6, 7]},
        )
    """

    def __init__(
        self,
        input_observation_space: Optional[gym.Space] = None,
        input_action_space: Optional[gym.Space] = None,
        *,
        multi_agent: bool = False,
        **kwargs,
    ):
        """Initializes a BatchIndividualItems instance.

        Args:
            multi_agent: Whether this is a connector operating on a multi-agent
                observation space mapping AgentIDs to individual agents'
                observations.
        """
        super().__init__(
            input_observation_space=input_observation_space,
            input_action_space=input_action_space,
            **kwargs,
        )
        # Switches `__call__` between the per-module (multi-agent) and the
        # per-episode-key (single-agent) batching code paths.
        self._multi_agent = multi_agent

    @override(ConnectorV2)
    def __call__(
        self,
        *,
        rl_module: RLModule,
        batch: Dict[str, Any],
        episodes: List[EpisodeType],
        explore: Optional[bool] = None,
        shared_data: Optional[dict] = None,
        **kwargs,
    ) -> Any:
        is_multi_rl_module = isinstance(rl_module, MultiRLModule)

        # Convert lists of individual items into properly batched data.
        for column, column_data in batch.copy().items():
            # Multi-agent case: This connector piece should only be used after(!)
            # the AgentToModuleMapping connector has already been applied, leading
            # to a batch structure of:
            # [module_id] -> [col0] -> [list of individual batch items]
            if is_multi_rl_module and column in rl_module:
                # Case, in which a column has already been properly batched before
                # this connector piece is called.
                if not self._multi_agent:
                    continue
                # If MA Off-Policy and independent sampling we need to overcome this
                # check.
                module_data = column_data
                for col, col_data in module_data.copy().items():
                    if isinstance(col_data, list) and col != Columns.INFOS:
                        module_data[col] = batch_fn(
                            col_data,
                            individual_items_already_have_batch_dim="auto",
                        )

            # Simple case: There is a list directly under `column`:
            # Batch the list.
            elif isinstance(column_data, list):
                batch[column] = batch_fn(
                    column_data,
                    individual_items_already_have_batch_dim="auto",
                )

            # Single-agent case: There is a dict under `column` mapping
            # `eps_id` to lists of items:
            # Concat all these lists, then batch.
            elif not self._multi_agent:
                # TODO: only really need this in non-Learner connector pipeline
                memorized_map_structure = []
                list_to_be_batched = []
                for (eps_id,) in column_data.keys():
                    for item in column_data[(eps_id,)]:
                        # Only record structure for OBS column.
                        if column == Columns.OBS:
                            memorized_map_structure.append(eps_id)
                        list_to_be_batched.append(item)
                # INFOS should not be batched (remain a list).
                batch[column] = (
                    list_to_be_batched
                    if column == Columns.INFOS
                    else batch_fn(
                        list_to_be_batched,
                        individual_items_already_have_batch_dim="auto",
                    )
                )
                # In a multi-RLModule setup, nest the (single-agent) column under
                # the default module ID.
                if is_multi_rl_module:
                    if DEFAULT_MODULE_ID not in batch:
                        batch[DEFAULT_MODULE_ID] = {}
                    batch[DEFAULT_MODULE_ID][column] = batch.pop(column)

                # Only record structure for OBS column.
                if column == Columns.OBS:
                    shared_data["memorized_map_structure"] = memorized_map_structure
            # Multi-agent case: But Module ID not found in our RLModule -> Ignore
            # this `module_id` entirely.
            # else:
            #     pass

        return batch
+ if self._multi_agent: + ret = {} + for agent_id, obs_space in input_observation_space.spaces.items(): + ret[agent_id] = self._convert_individual_space(obs_space) + return gym.spaces.Dict(ret) + else: + return self._convert_individual_space(input_observation_space) + + def __init__( + self, + input_observation_space: Optional[gym.Space] = None, + input_action_space: Optional[gym.Space] = None, + *, + num_frames: int = 1, + multi_agent: bool = False, + as_learner_connector: bool = False, + **kwargs, + ): + """Initializes a _FrameStackingConnector instance. + + Args: + num_frames: The number of observation frames to stack up (into a single + observation) for the RLModule's forward pass. + multi_agent: Whether this is a connector operating on a multi-agent + observation space mapping AgentIDs to individual agents' observations. + as_learner_connector: Whether this connector is part of a Learner connector + pipeline, as opposed to an env-to-module pipeline. + """ + super().__init__( + input_observation_space=input_observation_space, + input_action_space=input_action_space, + **kwargs, + ) + + self._multi_agent = multi_agent + self.num_frames = num_frames + self._as_learner_connector = as_learner_connector + + @override(ConnectorV2) + def __call__( + self, + *, + rl_module: RLModule, + batch: Dict[str, Any], + episodes: List[EpisodeType], + explore: Optional[bool] = None, + shared_data: Optional[dict] = None, + **kwargs, + ) -> Any: + # Learner connector pipeline. Episodes have been numpy'ized. + if self._as_learner_connector: + for sa_episode in self.single_agent_episode_iterator( + episodes, agents_that_stepped_only=False + ): + + def _map_fn(s, _sa_episode=sa_episode): + # Squeeze out last dim. + s = np.squeeze(s, axis=-1) + # Calculate new shape and strides + new_shape = (len(_sa_episode), self.num_frames) + s.shape[1:] + new_strides = (s.strides[0],) + s.strides + # Create a strided view of the array. 
+ return np.transpose( + np.lib.stride_tricks.as_strided( + s, shape=new_shape, strides=new_strides + ), + axes=[0, 2, 3, 1], + ) + + # Get all observations from the episode in one np array (except for + # the very last one, which is the final observation not needed for + # learning). + self.add_n_batch_items( + batch=batch, + column=Columns.OBS, + items_to_add=tree.map_structure( + _map_fn, + sa_episode.get_observations( + indices=slice(-self.num_frames + 1, len(sa_episode)), + neg_index_as_lookback=True, + fill=0.0, + ), + ), + num_items=len(sa_episode), + single_agent_episode=sa_episode, + ) + + # Env-to-module pipeline. Episodes still operate on lists. + else: + for sa_episode in self.single_agent_episode_iterator(episodes): + assert not sa_episode.is_numpy + # Get the list of observations to stack. + obs_stack = sa_episode.get_observations( + indices=slice(-self.num_frames, None), + fill=0.0, + ) + # Observation components are (w, h, 1) + # -> concatenate along axis=-1 to (w, h, [num_frames]). + stacked_obs = tree.map_structure( + lambda *s: np.concatenate(s, axis=2), + *obs_stack, + ) + self.add_batch_item( + batch=batch, + column=Columns.OBS, + item_to_add=stacked_obs, + single_agent_episode=sa_episode, + ) + + return batch + + def _convert_individual_space(self, obs_space): + # Some assumptions: Space is box AND last dim (the stacking one) is 1. 
+ assert isinstance(obs_space, gym.spaces.Box), obs_space + assert obs_space.shape[-1] == 1, obs_space + + return gym.spaces.Box( + low=np.repeat(obs_space.low, repeats=self.num_frames, axis=-1), + high=np.repeat(obs_space.high, repeats=self.num_frames, axis=-1), + shape=list(obs_space.shape)[:-1] + [self.num_frames], + dtype=obs_space.dtype, + ) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/module_to_agent_unmapping.py b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/module_to_agent_unmapping.py new file mode 100644 index 0000000000000000000000000000000000000000..fb3a2b1e954e98ee89a455c53d70304d2cd9b7f0 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/module_to_agent_unmapping.py @@ -0,0 +1,48 @@ +from collections import defaultdict +from typing import Any, Dict, List, Optional + +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.env.multi_agent_episode import MultiAgentEpisode +from ray.rllib.utils.annotations import override +from ray.rllib.utils.typing import EpisodeType +from ray.util.annotations import PublicAPI + + +@PublicAPI(stability="alpha") +class ModuleToAgentUnmapping(ConnectorV2): + """Performs flipping of `data` from ModuleID- to AgentID based mapping. + + Before mapping: + data[module1] -> [col, e.g. ACTIONS] + -> [dict mapping episode-identifying tuples to lists of data] + data[module2] -> ... + + After mapping: + data[ACTIONS]: [dict mapping episode-identifying tuples to lists of data] + + Note that episode-identifying tuples have the form of: (episode_id,) in the + single-agent case and (ma_episode_id, agent_id, module_id) in the multi-agent + case. 
+ """ + + @override(ConnectorV2) + def __call__( + self, + *, + rl_module: RLModule, + batch: Dict[str, Any], + episodes: List[EpisodeType], + explore: Optional[bool] = None, + shared_data: Optional[dict] = None, + **kwargs, + ) -> Any: + # This Connector should only be used in a multi-agent setting. + assert isinstance(episodes[0], MultiAgentEpisode) + + agent_data = defaultdict(dict) + for module_id, module_data in batch.items(): + for column, values_dict in module_data.items(): + agent_data[column].update(values_dict) + + return dict(agent_data) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/numpy_to_tensor.py b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/numpy_to_tensor.py new file mode 100644 index 0000000000000000000000000000000000000000..7c0123c44990ec52e991004c259f32070ce05e73 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/numpy_to_tensor.py @@ -0,0 +1,125 @@ +from typing import Any, Dict, List, Optional + +import gymnasium as gym + +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.core import DEFAULT_MODULE_ID +from ray.rllib.core.columns import Columns +from ray.rllib.core.rl_module.multi_rl_module import MultiRLModule +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.utils.annotations import override +from ray.rllib.utils.torch_utils import convert_to_torch_tensor +from ray.rllib.utils.typing import EpisodeType +from ray.util.annotations import PublicAPI + + +@PublicAPI(stability="alpha") +class NumpyToTensor(ConnectorV2): + """Converts numpy arrays across the entire input data into (framework) tensors. + + The framework information is received via the provided `rl_module` arg in the + `__call__()` method. 
+ + Note: This is one of the default env-to-module or Learner ConnectorV2 pieces that + are added automatically by RLlib into every env-to-module/Learner connector + pipeline, unless `config.add_default_connectors_to_env_to_module_pipeline` or + `config.add_default_connectors_to_learner_pipeline ` are set to + False. + + The default env-to-module connector pipeline is: + [ + [0 or more user defined ConnectorV2 pieces], + AddObservationsFromEpisodesToBatch, + AddTimeDimToBatchAndZeroPad, + AddStatesFromEpisodesToBatch, + AgentToModuleMapping, # only in multi-agent setups! + BatchIndividualItems, + NumpyToTensor, + ] + The default Learner connector pipeline is: + [ + [0 or more user defined ConnectorV2 pieces], + AddObservationsFromEpisodesToBatch, + AddColumnsFromEpisodesToTrainBatch, + AddTimeDimToBatchAndZeroPad, + AddStatesFromEpisodesToBatch, + AgentToModuleMapping, # only in multi-agent setups! + BatchIndividualItems, + NumpyToTensor, + ] + + This ConnectorV2: + - Loops through the input `data` and converts all found numpy arrays into + framework-specific tensors (possibly on a GPU). + """ + + def __init__( + self, + input_observation_space: Optional[gym.Space] = None, + input_action_space: Optional[gym.Space] = None, + *, + as_learner_connector: bool = False, + pin_mempory: Optional[bool] = None, + device: Optional[str] = None, + **kwargs, + ): + """Initializes a NumpyToTensor instance. + + Args: + as_learner_connector: Whether this ConnectorV2 piece is used inside a + LearnerConnectorPipeline or not. + pin_mempory: Whether to pin memory when creating (torch) tensors. + If None (default), pins memory if `as_learner_connector` is True, + otherwise doesn't pin memory. + device: An optional device to move the resulting tensors to. If not + provided, all data will be left on the CPU. 
+ **kwargs: + """ + super().__init__( + input_observation_space=input_observation_space, + input_action_space=input_action_space, + **kwargs, + ) + self._as_learner_connector = as_learner_connector + self._pin_memory = ( + pin_mempory if pin_mempory is not None else self._as_learner_connector + ) + self._device = device + + @override(ConnectorV2) + def __call__( + self, + *, + rl_module: RLModule, + batch: Dict[str, Any], + episodes: List[EpisodeType], + explore: Optional[bool] = None, + shared_data: Optional[dict] = None, + **kwargs, + ) -> Any: + is_single_agent = False + is_multi_rl_module = isinstance(rl_module, MultiRLModule) + # `data` already a ModuleID to batch mapping format. + if not (is_multi_rl_module and all(c in rl_module._rl_modules for c in batch)): + is_single_agent = True + batch = {DEFAULT_MODULE_ID: batch} + + for module_id, module_data in batch.copy().items(): + infos = module_data.pop(Columns.INFOS, None) + if rl_module.framework == "torch": + module_data = convert_to_torch_tensor( + module_data, pin_memory=self._pin_memory, device=self._device + ) + else: + raise ValueError( + "`NumpyToTensor`does NOT support frameworks other than torch!" + ) + if infos is not None: + module_data[Columns.INFOS] = infos + # Early out with data under(!) `DEFAULT_MODULE_ID`, b/c we are in plain + # single-agent mode. 
+ if is_single_agent: + return module_data + batch[module_id] = module_data + + return batch diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/tensor_to_numpy.py b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/tensor_to_numpy.py new file mode 100644 index 0000000000000000000000000000000000000000..f6bbb2669c7942087e2e9fc1817a9dd3d48e1280 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/tensor_to_numpy.py @@ -0,0 +1,26 @@ +from typing import Any, Dict, List, Optional + +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.utils.annotations import override +from ray.rllib.utils.numpy import convert_to_numpy +from ray.rllib.utils.typing import EpisodeType +from ray.util.annotations import PublicAPI + + +@PublicAPI(stability="alpha") +class TensorToNumpy(ConnectorV2): + """Converts (framework) tensors across the entire input data into numpy arrays.""" + + @override(ConnectorV2) + def __call__( + self, + *, + rl_module: RLModule, + batch: Dict[str, Any], + episodes: List[EpisodeType], + explore: Optional[bool] = None, + shared_data: Optional[dict] = None, + **kwargs, + ) -> Any: + return convert_to_numpy(batch) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/env_to_module/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/env_to_module/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..715ac1ffbb62e4e943c15e8f782d16c1f812f7f6 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/env_to_module/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/env_to_module/__pycache__/env_to_module_pipeline.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/env_to_module/__pycache__/env_to_module_pipeline.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e50b62079f6d25e852b7534c8f92c03d5852b986 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/env_to_module/__pycache__/env_to_module_pipeline.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/env_to_module/__pycache__/flatten_observations.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/env_to_module/__pycache__/flatten_observations.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7462114c7996818b1e097a7d652225b1c1e48ed6 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/env_to_module/__pycache__/flatten_observations.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/env_to_module/__pycache__/frame_stacking.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/env_to_module/__pycache__/frame_stacking.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..461f918b04ff0006d7ead473a0805eb90a087d3b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/env_to_module/__pycache__/frame_stacking.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/env_to_module/__pycache__/observation_preprocessor.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/env_to_module/__pycache__/observation_preprocessor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..123cb6365fea6080141f671848a4ef62e844e83c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/env_to_module/__pycache__/observation_preprocessor.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/env_to_module/__pycache__/write_observations_to_episodes.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/env_to_module/__pycache__/write_observations_to_episodes.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fd1cfa696c9d1e6d41219d61cd44ec3249b54048 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/env_to_module/__pycache__/write_observations_to_episodes.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/__pycache__/add_one_ts_to_episodes_and_truncate.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/__pycache__/add_one_ts_to_episodes_and_truncate.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9882468e2de537e0fd446fb838443e4261b98348 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/__pycache__/add_one_ts_to_episodes_and_truncate.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/__pycache__/frame_stacking.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/__pycache__/frame_stacking.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1f2d3bf38e10580e0b5a97b2f53ebd3745a7c5f8 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/__pycache__/frame_stacking.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/__pycache__/learner_connector_pipeline.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/__pycache__/learner_connector_pipeline.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bde8e25e06cbd9f44cf6fbbb11bccc61b5086b65 Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/__pycache__/learner_connector_pipeline.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/add_columns_from_episodes_to_train_batch.py b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/add_columns_from_episodes_to_train_batch.py new file mode 100644 index 0000000000000000000000000000000000000000..6601b3b2011df21b870234e14f7654005bc9722d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/add_columns_from_episodes_to_train_batch.py @@ -0,0 +1,166 @@ +from typing import Any, Dict, List, Optional + +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.core.columns import Columns +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.utils.annotations import override +from ray.rllib.utils.typing import EpisodeType +from ray.util.annotations import PublicAPI + + +@PublicAPI(stability="alpha") +class AddColumnsFromEpisodesToTrainBatch(ConnectorV2): + """Adds infos/actions/rewards/terminateds/... to train batch. + + Note: This is one of the default Learner ConnectorV2 pieces that are added + automatically by RLlib into every Learner connector pipeline, unless + `config.add_default_connectors_to_learner_pipeline` is set to False. + + The default Learner connector pipeline is: + [ + [0 or more user defined ConnectorV2 pieces], + AddObservationsFromEpisodesToBatch, + AddColumnsFromEpisodesToTrainBatch, + AddTimeDimToBatchAndZeroPad, + AddStatesFromEpisodesToBatch, + AgentToModuleMapping, # only in multi-agent setups! + BatchIndividualItems, + NumpyToTensor, + ] + + Does NOT add observations to train batch (these should have already been added + by another ConnectorV2 piece: `AddObservationsToTrainBatch` in the same pipeline). 
+ + If provided with `episodes` data, this connector piece makes sure that the final + train batch going into the RLModule for updating (`forward_train()` call) contains + at the minimum: + - Observations: From all episodes under the Columns.OBS key. + - Actions, rewards, terminal/truncation flags: From all episodes under the + respective keys. + - All data inside the episodes' `extra_model_outs` property, e.g. action logp and + action probs under the respective keys. + - Internal states: These will NOT be added to the batch by this connector piece + as this functionality is handled by a different default connector piece: + `AddStatesFromEpisodesToBatch`. + + If the user wants to customize their own data under the given keys (e.g. obs, + actions, ...), they can extract from the episodes or recompute from `data` + their own data and store it in `data` under those keys. In this case, the default + connector will not change the data under these keys and simply act as a + pass-through. + """ + + @override(ConnectorV2) + def __call__( + self, + *, + rl_module: RLModule, + batch: Optional[Dict[str, Any]], + episodes: List[EpisodeType], + explore: Optional[bool] = None, + shared_data: Optional[dict] = None, + **kwargs, + ) -> Any: + # Infos. + if Columns.INFOS not in batch: + for sa_episode in self.single_agent_episode_iterator( + episodes, + agents_that_stepped_only=False, + ): + self.add_n_batch_items( + batch, + Columns.INFOS, + items_to_add=sa_episode.get_infos(slice(0, len(sa_episode))), + num_items=len(sa_episode), + single_agent_episode=sa_episode, + ) + + # Actions. + if Columns.ACTIONS not in batch: + for sa_episode in self.single_agent_episode_iterator( + episodes, + agents_that_stepped_only=False, + ): + self.add_n_batch_items( + batch, + Columns.ACTIONS, + items_to_add=[ + sa_episode.get_actions(indices=ts) + for ts in range(len(sa_episode)) + ], + num_items=len(sa_episode), + single_agent_episode=sa_episode, + ) + # Rewards. 
+ if Columns.REWARDS not in batch: + for sa_episode in self.single_agent_episode_iterator( + episodes, + agents_that_stepped_only=False, + ): + self.add_n_batch_items( + batch, + Columns.REWARDS, + items_to_add=[ + sa_episode.get_rewards(indices=ts) + for ts in range(len(sa_episode)) + ], + num_items=len(sa_episode), + single_agent_episode=sa_episode, + ) + # Terminateds. + if Columns.TERMINATEDS not in batch: + for sa_episode in self.single_agent_episode_iterator( + episodes, + agents_that_stepped_only=False, + ): + self.add_n_batch_items( + batch, + Columns.TERMINATEDS, + items_to_add=( + [False] * (len(sa_episode) - 1) + [sa_episode.is_terminated] + if len(sa_episode) > 0 + else [] + ), + num_items=len(sa_episode), + single_agent_episode=sa_episode, + ) + # Truncateds. + if Columns.TRUNCATEDS not in batch: + for sa_episode in self.single_agent_episode_iterator( + episodes, + agents_that_stepped_only=False, + ): + self.add_n_batch_items( + batch, + Columns.TRUNCATEDS, + items_to_add=( + [False] * (len(sa_episode) - 1) + [sa_episode.is_truncated] + if len(sa_episode) > 0 + else [] + ), + num_items=len(sa_episode), + single_agent_episode=sa_episode, + ) + # Extra model outputs (except for STATE_OUT, which will be handled by another + # default connector piece). Also, like with all the fields above, skip + # those that the user already seemed to have populated via custom connector + # pieces. 
+ skip_columns = set(batch.keys()) | {Columns.STATE_IN, Columns.STATE_OUT} + for sa_episode in self.single_agent_episode_iterator( + episodes, + agents_that_stepped_only=False, + ): + for column in sa_episode.extra_model_outputs.keys(): + if column not in skip_columns: + self.add_n_batch_items( + batch, + column, + items_to_add=[ + sa_episode.get_extra_model_outputs(key=column, indices=ts) + for ts in range(len(sa_episode)) + ], + num_items=len(sa_episode), + single_agent_episode=sa_episode, + ) + + return batch diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/add_next_observations_from_episodes_to_train_batch.py b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/add_next_observations_from_episodes_to_train_batch.py new file mode 100644 index 0000000000000000000000000000000000000000..6efa3b706bf1f24e61f9791b273e7f11b08b5066 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/add_next_observations_from_episodes_to_train_batch.py @@ -0,0 +1,103 @@ +from typing import Any, Dict, List, Optional + +from ray.rllib.core.columns import Columns +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.utils.annotations import override +from ray.rllib.utils.typing import EpisodeType +from ray.util.annotations import PublicAPI + + +@PublicAPI(stability="alpha") +class AddNextObservationsFromEpisodesToTrainBatch(ConnectorV2): + """Adds the NEXT_OBS column with the correct episode observations to train batch. + + - Operates on a list of Episode objects. + - Gets all observation(s) from all the given episodes (except the very first ones) + and adds them to the batch under construction in the NEXT_OBS column (as a list of + individual observations). + - Does NOT alter any observations (or other data) in the given episodes. + - Can be used in Learner connector pipelines. + + .. 
testcode:: + + import gymnasium as gym + import numpy as np + + from ray.rllib.connectors.learner import ( + AddNextObservationsFromEpisodesToTrainBatch + ) + from ray.rllib.core.columns import Columns + from ray.rllib.env.single_agent_episode import SingleAgentEpisode + from ray.rllib.utils.test_utils import check + + # Create two dummy SingleAgentEpisodes, each containing 3 observations, + # 2 actions and 2 rewards (both episodes are length=2). + obs_space = gym.spaces.Box(-1.0, 1.0, (2,), np.float32) + act_space = gym.spaces.Discrete(2) + + episodes = [SingleAgentEpisode( + observations=[obs_space.sample(), obs_space.sample(), obs_space.sample()], + actions=[act_space.sample(), act_space.sample()], + rewards=[1.0, 2.0], + len_lookback_buffer=0, + ) for _ in range(2)] + eps_1_next_obses = episodes[0].get_observations([1, 2]) + eps_2_next_obses = episodes[1].get_observations([1, 2]) + print(f"1st Episode's next obses are {eps_1_next_obses}") + print(f"2nd Episode's next obses are {eps_2_next_obses}") + + # Create an instance of this class. + connector = AddNextObservationsFromEpisodesToTrainBatch() + + # Call the connector with the two created episodes. + # Note that this particular connector works without an RLModule, so we + # simplify here for the sake of this example. + output_data = connector( + rl_module=None, + batch={}, + episodes=episodes, + explore=True, + shared_data={}, + ) + # The output data should now contain the last observations of both episodes, + # in a "per-episode organized" fashion. + check( + output_data, + { + Columns.NEXT_OBS: { + (episodes[0].id_,): eps_1_next_obses, + (episodes[1].id_,): eps_2_next_obses, + }, + }, + ) + """ + + @override(ConnectorV2) + def __call__( + self, + *, + rl_module: RLModule, + batch: Dict[str, Any], + episodes: List[EpisodeType], + explore: Optional[bool] = None, + shared_data: Optional[dict] = None, + **kwargs, + ) -> Any: + # If "obs" already in `batch`, early out. 
+ if Columns.NEXT_OBS in batch: + return batch + + for sa_episode in self.single_agent_episode_iterator( + # This is a Learner-only connector -> Get all episodes (for train batch). + episodes, + agents_that_stepped_only=False, + ): + self.add_n_batch_items( + batch, + Columns.NEXT_OBS, + items_to_add=sa_episode.get_observations(slice(1, len(sa_episode) + 1)), + num_items=len(sa_episode), + single_agent_episode=sa_episode, + ) + return batch diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/add_one_ts_to_episodes_and_truncate.py b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/add_one_ts_to_episodes_and_truncate.py new file mode 100644 index 0000000000000000000000000000000000000000..fcd3703eeb855e1ab1fad584c1aa01b5f922f5cb --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/add_one_ts_to_episodes_and_truncate.py @@ -0,0 +1,168 @@ +from typing import Any, Dict, List, Optional + +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.core.columns import Columns +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.env.multi_agent_episode import MultiAgentEpisode +from ray.rllib.utils.annotations import override +from ray.rllib.utils.postprocessing.episodes import add_one_ts_to_episodes_and_truncate +from ray.rllib.utils.typing import EpisodeType +from ray.util.annotations import PublicAPI + + +@PublicAPI(stability="alpha") +class AddOneTsToEpisodesAndTruncate(ConnectorV2): + """Adds an artificial timestep to all incoming episodes at the end. + + In detail: The last observations, infos, actions, and all `extra_model_outputs` + will be duplicated and appended to each episode's data. An extra 0.0 reward + will be appended to the episode's rewards. The episode's timestep will be + increased by 1. Also, adds the truncated=True flag to each episode if the + episode is not already done (terminated or truncated). 
+ + Useful for value function bootstrapping, where it is required to compute a + forward pass for the very last timestep within the episode, + i.e. using the following input dict: { + obs=[final obs], + state=[final state output], + prev. reward=[final reward], + etc.. + } + + .. testcode:: + + from ray.rllib.connectors.learner import AddOneTsToEpisodesAndTruncate + from ray.rllib.env.single_agent_episode import SingleAgentEpisode + from ray.rllib.utils.test_utils import check + + # Create 2 episodes (both to be extended by one timestep). + episode1 = SingleAgentEpisode( + observations=[0, 1, 2], + actions=[0, 1], + rewards=[0.0, 1.0], + terminated=False, + truncated=False, + len_lookback_buffer=0, + ).to_numpy() + check(len(episode1), 2) + check(episode1.is_truncated, False) + + episode2 = SingleAgentEpisode( + observations=[0, 1, 2, 3, 4, 5], + actions=[0, 1, 2, 3, 4], + rewards=[0.0, 1.0, 2.0, 3.0, 4.0], + terminated=True, # a terminated episode + truncated=False, + len_lookback_buffer=0, + ).to_numpy() + check(len(episode2), 5) + check(episode2.is_truncated, False) + check(episode2.is_terminated, True) + + # Create an instance of this class. + connector = AddOneTsToEpisodesAndTruncate() + + # Call the connector. + shared_data = {} + _ = connector( + rl_module=None, # Connector used here does not require RLModule. + batch={}, + episodes=[episode1, episode2], + shared_data=shared_data, + ) + # Check on the episodes. Both of them should now be 1 timestep longer. 
+ check(len(episode1), 3) + check(episode1.is_truncated, True) + check(len(episode2), 6) + check(episode2.is_truncated, False) + check(episode2.is_terminated, True) + """ + + @override(ConnectorV2) + def __call__( + self, + *, + rl_module: RLModule, + batch: Dict[str, Any], + episodes: List[EpisodeType], + explore: Optional[bool] = None, + shared_data: Optional[dict] = None, + **kwargs, + ) -> Any: + # Build the loss mask to make sure the extra added timesteps do not influence + # the final loss and fix the terminateds and truncateds in the batch. + + # For proper v-trace execution, the rules must be as follows: + # Legend: + # T: terminal=True + # R: truncated=True + # B0: bootstrap with value 0 (also: terminal=True) + # Bx: bootstrap with some vf-computed value (also: terminal=True) + + # batch: - - - - - - - T B0- - - - - R Bx- - - - R Bx + # mask : t t t t t t t t f t t t t t t f t t t t t f + + # TODO (sven): Same situation as in TODO below, but for multi-agent episode. + # Maybe add a dedicated connector piece for this task? + # We extend the MultiAgentEpisode's ID by a running number here to make sure + # we treat each MAEpisode chunk as separate (for potentially upcoming v-trace + # and LSTM zero-padding) and don't mix data from different chunks. + if isinstance(episodes[0], MultiAgentEpisode): + for i, ma_episode in enumerate(episodes): + ma_episode.id_ += "_" + str(i) + # Also change the underlying single-agent episode's + # `multi_agent_episode_id` properties. + for sa_episode in ma_episode.agent_episodes.values(): + sa_episode.multi_agent_episode_id = ma_episode.id_ + + for i, sa_episode in enumerate( + self.single_agent_episode_iterator(episodes, agents_that_stepped_only=False) + ): + # TODO (sven): This is a little bit of a hack: By extending the Episode's + # ID, we make sure that each episode chunk in `episodes` is treated as a + # separate episode in the `self.add_n_batch_items` below. Some algos (e.g. 
+ # APPO) may have >1 episode chunks from the same episode (same ID) in the + # training data, thus leading to a malformatted batch in case of + # RNN-triggered zero-padding of the train batch. + # For example, if e1 (id=a len=4) and e2 (id=a len=5) are two chunks of the + # same episode in `episodes`, the resulting batch would have an additional + # timestep in the middle of the episode's "row": + # { "obs": { + # ("a", <- eps ID): [0, 1, 2, 3 <- len=4, [additional 1 ts (bad)], + # 0, 1, 2, 3, 4 <- len=5, [additional 1 ts]] + # }} + sa_episode.id_ += "_" + str(i) + + len_ = len(sa_episode) + + # Extend all episodes by one ts. + add_one_ts_to_episodes_and_truncate([sa_episode]) + + loss_mask = [True for _ in range(len_)] + [False] + self.add_n_batch_items( + batch, + Columns.LOSS_MASK, + loss_mask, + len_ + 1, + sa_episode, + ) + + terminateds = ( + [False for _ in range(len_ - 1)] + + [bool(sa_episode.is_terminated)] + + [True] # extra timestep + ) + self.add_n_batch_items( + batch, + Columns.TERMINATEDS, + terminateds, + len_ + 1, + sa_episode, + ) + + # Signal to following connector pieces that the loss-mask which masks out + # invalid episode ts (for the extra added ts at the end) has already been + # added to `data`. 
class ComputeReturnsToGo(ConnectorV2):
    """Learner ConnectorV2 piece computing discounted returns-to-go per timestep.

    This ConnectorV2:
    - Operates on a list of Episode objects (single- or multi-agent).
    - Should be used only in the Learner pipeline as a preparation for an upcoming
      loss computation that requires the discounted returns to go (until the end
      of the episode).
    - For each agent, for each episode, and at each timestep t, computes the sum
      of discounted rewards from t until the end of the episode and writes the
      results into a new batch column: ``Columns.RETURNS_TO_GO``.
    """

    def __init__(
        self,
        input_observation_space=None,
        input_action_space=None,
        *,
        gamma,
    ):
        """Initializes a ComputeReturnsToGo instance.

        Args:
            gamma: The discount factor gamma.
        """
        super().__init__(input_observation_space, input_action_space)
        self.gamma = gamma

    def __call__(
        self,
        *,
        rl_module: MultiRLModule,
        episodes: List[EpisodeType],
        batch: Dict[str, Any],
        **kwargs,
    ):
        """Adds ``Columns.RETURNS_TO_GO`` for every single-agent episode.

        Returns:
            The (in-place) updated `batch`.
        """
        # Bug fix/robustness: the module-level `import scipy` does NOT reliably
        # expose the `scipy.signal` subpackage (SciPy subpackages must be
        # imported explicitly on some versions). Import it here to avoid a
        # possible AttributeError at call time.
        import scipy.signal

        for sa_episode in self.single_agent_episode_iterator(
            episodes, agents_that_stepped_only=False
        ):
            # Discounted reverse cumulative sum via an IIR filter:
            # y[t] = x[t] + gamma * y[t-1], applied to the reversed rewards.
            rewards_reversed = sa_episode.get_rewards()[::-1]
            discounted_cumsum_reversed = scipy.signal.lfilter(
                [1], [1, -self.gamma], rewards_reversed
            )
            # Reverse back to restore the original time order.
            discounted_returns = discounted_cumsum_reversed[::-1]

            # Add the results to the batch under the RETURNS_TO_GO column.
            self.add_n_batch_items(
                batch=batch,
                column=Columns.RETURNS_TO_GO,
                items_to_add=discounted_returns,
                num_items=len(sa_episode),
                single_agent_episode=sa_episode,
            )

        return batch
class GeneralAdvantageEstimation(ConnectorV2):
    """Learner ConnectorV2 piece computing GAE advantages and value targets.

    This ConnectorV2:
    - Operates on a list of Episode objects (single- or multi-agent).
    - Should be used only in the Learner pipeline and as one of its last pieces
      (it requires the batch for the value-function forward passes to already be
      complete).
    - Requires the incoming episodes to already be elongated by one artificial
      timestep at the end (last obs, actions, states, etc. repeated, last
      reward=0.0, etc.), which allows combining the per-timestep value
      computations with the necessary "bootstrap" value computations at the
      episode (chunk) truncation points. Use the `ray.rllib.connectors.learner.
      add_one_ts_to_episodes_and_truncate.AddOneTsToEpisodesAndTruncate`
      connector piece to add that extra timestep.

    The GAE computation reuses the arriving `batch` as the forward batch for the
    value function, extracts the bootstrap values (at the artificially added
    timesteps) and all other value predictions (all other timesteps), performs
    GAE, and writes the results back into `batch` (under
    Postprocessing.ADVANTAGES and Postprocessing.VALUE_TARGETS).
    """

    def __init__(
        self,
        input_observation_space=None,
        input_action_space=None,
        *,
        gamma,
        lambda_,
    ):
        """Initializes a GeneralAdvantageEstimation instance.

        Args:
            gamma: The discount factor gamma.
            lambda_: The lambda parameter for General Advantage Estimation
                (GAE). Defines the exponential weight used between actually
                measured rewards vs value function estimates over multiple time
                steps. Specifically, `lambda_` balances short-term, low-variance
                estimates with longer-term, high-variance returns. A `lambda_`
                of 0.0 makes the GAE rely only on immediate rewards (and vf
                predictions from there on, reducing variance, but increasing
                bias), while a `lambda_` of 1.0 only incorporates vf predictions
                at the truncation points of the given episodes or episode chunks
                (reducing bias but increasing variance).
        """
        super().__init__(input_observation_space, input_action_space)
        self.gamma = gamma
        self.lambda_ = lambda_

        # Lazily created numpy-to-tensor connector that translates the GAE
        # results (advantages and vf targets) back into framework tensors.
        self._numpy_to_tensor_connector = None

    @override(ConnectorV2)
    def __call__(
        self,
        *,
        rl_module: MultiRLModule,
        episodes: List[EpisodeType],
        batch: Dict[str, Any],
        **kwargs,
    ):
        # Device on which to place all GAE result tensors (advantages and value
        # targets) once computed.
        device = None

        # Flatten out all single-agent episodes (in order).
        sa_episodes = list(
            self.single_agent_episode_iterator(episodes, agents_that_stepped_only=False)
        )

        # One value-net forward pass per module; modules that are not in `batch`
        # or do not implement ValueFunctionAPI yield None.
        # TODO (sven): We need to check here in the pipeline already, whether a
        #  module should even be updated or not (which we usually do after(!)
        #  the Learner pipeline). This is an open TODO to move this filter into
        #  a connector as well. For now, we'll just check, whether `mid` is in
        #  batch and skip if it isn't.
        vf_preds = rl_module.foreach_module(
            func=lambda mid, module: (
                module.compute_values(batch[mid])
                if mid in batch and isinstance(module, ValueFunctionAPI)
                else None
            ),
            return_dict=True,
        )

        # Perform each module's GAE computation.
        for module_id, vf_tensor in vf_preds.items():
            # None -> this RLModule does not implement ValueFunctionAPI; skip.
            if vf_tensor is None:
                continue

            module = rl_module[module_id]
            device = vf_tensor.device
            # All GAE math below happens in numpy.
            vf_np = convert_to_numpy(vf_tensor)

            # Lengths of the single-agent episodes belonging to this module.
            episode_lens = [
                len(eps) for eps in sa_episodes if eps.module_id in [None, module_id]
            ]

            # Strip any zero-padding again, if applicable, before computing
            # the value targets.
            vf_np = unpad_data_if_necessary(episode_lens, vf_np)
            value_targets = compute_value_targets(
                values=vf_np,
                rewards=unpad_data_if_necessary(
                    episode_lens,
                    convert_to_numpy(batch[module_id][Columns.REWARDS]),
                ),
                terminateds=unpad_data_if_necessary(
                    episode_lens,
                    convert_to_numpy(batch[module_id][Columns.TERMINATEDS]),
                ),
                truncateds=unpad_data_if_necessary(
                    episode_lens,
                    convert_to_numpy(batch[module_id][Columns.TRUNCATEDS]),
                ),
                gamma=self.gamma,
                lambda_=self.lambda_,
            )
            assert value_targets.shape[0] == sum(episode_lens)

            # Advantages = value targets minus value predictions. The vf-preds
            # themselves are not needed in the loss (e.g. DefaultPPORLModule
            # recomputes them in each `forward_train` anyway) and are dropped.
            advantages = value_targets - vf_np
            # Standardize advantages for more stable and better weighted policy
            # gradient computations.
            advantages = (advantages - advantages.mean()) / max(
                1e-4, advantages.std()
            )

            # Stateful modules: re-apply zero-padding to match the (B, T)
            # layout of the rest of the batch.
            if module.is_stateful():
                max_seq_len = module.model_config["max_seq_len"]
                advantages = np.stack(
                    split_and_zero_pad_n_episodes(
                        advantages,
                        episode_lens=episode_lens,
                        max_seq_len=max_seq_len,
                    ),
                    axis=0,
                )
                value_targets = np.stack(
                    split_and_zero_pad_n_episodes(
                        value_targets,
                        episode_lens=episode_lens,
                        max_seq_len=max_seq_len,
                    ),
                    axis=0,
                )

            batch[module_id][Postprocessing.ADVANTAGES] = advantages
            batch[module_id][Postprocessing.VALUE_TARGETS] = value_targets

        # Translate all GAE results into framework tensors (the connector is
        # created lazily, once the correct device is known).
        if self._numpy_to_tensor_connector is None:
            self._numpy_to_tensor_connector = NumpyToTensor(
                as_learner_connector=True, device=device
            )
        tensor_results = self._numpy_to_tensor_connector(
            rl_module=rl_module,
            batch={
                mid: {
                    Postprocessing.ADVANTAGES: mod_batch[Postprocessing.ADVANTAGES],
                    Postprocessing.VALUE_TARGETS: (
                        mod_batch[Postprocessing.VALUE_TARGETS]
                    ),
                }
                for mid, mod_batch in batch.items()
                if vf_preds[mid] is not None
            },
            episodes=episodes,
        )
        # Merge the converted tensors back into `batch`.
        for mid, mod_batch in tensor_results.items():
            batch[mid].update(mod_batch)

        return batch
@PublicAPI(stability="alpha")
class LearnerConnectorPipeline(ConnectorPipelineV2):
    """Connector pipeline run on the Learner side.

    Wraps the base pipeline call with metrics logging of the summed lengths of
    all incoming and outgoing episodes (pipeline pieces may change episode
    lengths in between).
    """

    @override(ConnectorPipelineV2)
    def __call__(
        self,
        *,
        rl_module: RLModule,
        batch: Optional[Dict[str, Any]] = None,
        episodes: List[EpisodeType],
        explore: bool = False,
        shared_data: Optional[dict] = None,
        metrics: Optional[MetricsLogger] = None,
        **kwargs,
    ):
        # Users are not required to send an initial input into this pipeline;
        # default `batch`/`shared_data` to empty dicts to be populated from
        # `episodes` by the pipeline pieces.
        if batch is None:
            batch = {}
        if shared_data is None:
            shared_data = {}

        # Log the sum of lengths of all episodes incoming.
        if metrics:
            metrics.log_value(
                (ALL_MODULES, LEARNER_CONNECTOR_SUM_EPISODES_LENGTH_IN),
                sum(len(eps) for eps in episodes),
            )

        result = super().__call__(
            rl_module=rl_module,
            batch=batch,
            episodes=episodes,
            shared_data=shared_data,
            explore=explore,
            metrics=metrics,
            metrics_prefix_key=(ALL_MODULES,),
            **kwargs,
        )

        # Log the sum of lengths of all episodes outgoing.
        if metrics:
            metrics.log_value(
                (ALL_MODULES, LEARNER_CONNECTOR_SUM_EPISODES_LENGTH_OUT),
                sum(len(eps) for eps in episodes),
            )

        return result
@OldAPIStack
class ActionDistribution:
    """The policy action distribution of an agent.

    Attributes:
        inputs: input vector to compute samples from.
        model (ModelV2): reference to model producing the inputs.
    """

    def __init__(self, inputs: List[TensorType], model: ModelV2):
        """Initializes an ActionDistribution instance.

        Args:
            inputs: input vector to compute samples from.
            model (ModelV2): reference to model producing the inputs. This
                is mainly useful if you want to use model variables to compute
                action outputs (i.e., for autoregressive action distributions,
                see examples/autoregressive_action_dist.py).
        """
        self.inputs = inputs
        self.model = model

    def sample(self) -> TensorType:
        """Draws a (stochastic) sample from this action distribution."""
        raise NotImplementedError

    def deterministic_sample(self) -> TensorType:
        """Returns the deterministic "sampling" output of this distribution.

        This is usually the max-likelihood output, i.e. the mean for a Normal
        distribution, the argmax for a Categorical one, etc..
        """
        raise NotImplementedError

    def sampled_action_logp(self) -> TensorType:
        """Returns the log probability of the last sampled action."""
        raise NotImplementedError

    def logp(self, x: TensorType) -> TensorType:
        """Returns the log-likelihood of `x` under this distribution."""
        raise NotImplementedError

    def kl(self, other: "ActionDistribution") -> TensorType:
        """Returns the KL-divergence between this distribution and `other`."""
        raise NotImplementedError

    def entropy(self) -> TensorType:
        """Returns the entropy of this action distribution."""
        raise NotImplementedError

    def multi_kl(self, other: "ActionDistribution") -> TensorType:
        """The KL-divergence between two action distributions.

        This differs from kl() in that it can return an array for
        MultiDiscrete. TODO(ekl) consider removing this.
        """
        # Default behavior: no per-component treatment -> same as kl().
        return self.kl(other)

    def multi_entropy(self) -> TensorType:
        """The entropy of the action distribution.

        This differs from entropy() in that it can return an array for
        MultiDiscrete. TODO(ekl) consider removing this.
        """
        # Default behavior: no per-component treatment -> same as entropy().
        return self.entropy()

    @staticmethod
    @OldAPIStack
    def required_model_output_shape(
        action_space: gym.Space, model_config: ModelConfigDict
    ) -> Union[int, np.ndarray]:
        """Returns the required shape of an input parameter tensor.

        Determined by the given action space and an optional dict of
        distribution-specific options.

        Args:
            action_space (gym.Space): The action space this distribution will
                be used for, whose shape attributes will be used to determine
                the required shape of the input parameter tensor.
            model_config: Model's config dict (as defined in catalog.py)

        Returns:
            model_output_shape (int or np.ndarray of ints): size of the
                required input vector (minus leading batch dimension).
        """
        raise NotImplementedError
ray.rllib.utils.annotations import DeveloperAPI, PublicAPI +from ray.rllib.utils.deprecation import ( + DEPRECATED_VALUE, + deprecation_warning, +) +from ray.rllib.utils.error import UnsupportedSpaceException +from ray.rllib.utils.framework import try_import_tf, try_import_torch +from ray.rllib.utils.from_config import from_config +from ray.rllib.utils.spaces.simplex import Simplex +from ray.rllib.utils.spaces.space_utils import flatten_space +from ray.rllib.utils.typing import ModelConfigDict, TensorType + +tf1, tf, tfv = try_import_tf() +torch, _ = try_import_torch() + +logger = logging.getLogger(__name__) + +# fmt: off +# __sphinx_doc_begin__ +MODEL_DEFAULTS: ModelConfigDict = { + "fcnet_hiddens": [256, 256], + "fcnet_activation": "tanh", + "fcnet_weights_initializer": None, + "fcnet_weights_initializer_config": None, + "fcnet_bias_initializer": None, + "fcnet_bias_initializer_config": None, + "conv_filters": None, + "conv_activation": "relu", + "conv_kernel_initializer": None, + "conv_kernel_initializer_config": None, + "conv_bias_initializer": None, + "conv_bias_initializer_config": None, + "conv_transpose_kernel_initializer": None, + "conv_transpose_kernel_initializer_config": None, + "conv_transpose_bias_initializer": None, + "conv_transpose_bias_initializer_config": None, + "post_fcnet_hiddens": [], + "post_fcnet_activation": "relu", + "post_fcnet_weights_initializer": None, + "post_fcnet_weights_initializer_config": None, + "post_fcnet_bias_initializer": None, + "post_fcnet_bias_initializer_config": None, + "free_log_std": False, + "log_std_clip_param": 20.0, + "no_final_linear": False, + "vf_share_layers": True, + "use_lstm": False, + "max_seq_len": 20, + "lstm_cell_size": 256, + "lstm_use_prev_action": False, + "lstm_use_prev_reward": False, + "lstm_weights_initializer": None, + "lstm_weights_initializer_config": None, + "lstm_bias_initializer": None, + "lstm_bias_initializer_config": None, + "_time_major": False, + "use_attention": False, + 
"attention_num_transformer_units": 1, + "attention_dim": 64, + "attention_num_heads": 1, + "attention_head_dim": 32, + "attention_memory_inference": 50, + "attention_memory_training": 50, + "attention_position_wise_mlp_dim": 32, + "attention_init_gru_gate_bias": 2.0, + "attention_use_n_prev_actions": 0, + "attention_use_n_prev_rewards": 0, + "framestack": True, + "dim": 84, + "grayscale": False, + "zero_mean": True, + "custom_model": None, + "custom_model_config": {}, + "custom_action_dist": None, + "custom_preprocessor": None, + "encoder_latent_dim": None, + "always_check_shapes": False, + + # Deprecated keys: + "lstm_use_prev_action_reward": DEPRECATED_VALUE, + "_use_default_native_models": DEPRECATED_VALUE, + "_disable_preprocessor_api": False, + "_disable_action_flattening": False, +} +# __sphinx_doc_end__ +# fmt: on + + +@DeveloperAPI +class ModelCatalog: + """Registry of models, preprocessors, and action distributions for envs. + + .. testcode:: + :skipif: True + + prep = ModelCatalog.get_preprocessor(env) + observation = prep.transform(raw_observation) + + dist_class, dist_dim = ModelCatalog.get_action_dist( + env.action_space, {}) + model = ModelCatalog.get_model_v2( + obs_space, action_space, num_outputs, options) + dist = dist_class(model.outputs, model) + action = dist.sample() + """ + + @staticmethod + @DeveloperAPI + def get_action_dist( + action_space: gym.Space, + config: ModelConfigDict, + dist_type: Optional[Union[str, Type[ActionDistribution]]] = None, + framework: str = "tf", + **kwargs + ) -> (type, int): + """Returns a distribution class and size for the given action space. + + Args: + action_space: Action space of the target gym env. + config (Optional[dict]): Optional model config. + dist_type (Optional[Union[str, Type[ActionDistribution]]]): + Identifier of the action distribution (str) interpreted as a + hint or the actual ActionDistribution class to use. + framework: One of "tf2", "tf", "torch", or "jax". 
+ kwargs: Optional kwargs to pass on to the Distribution's + constructor. + + Returns: + Tuple: + - dist_class (ActionDistribution): Python class of the + distribution. + - dist_dim (int): The size of the input vector to the + distribution. + """ + + dist_cls = None + config = config or MODEL_DEFAULTS + # Custom distribution given. + if config.get("custom_action_dist"): + custom_action_config = config.copy() + action_dist_name = custom_action_config.pop("custom_action_dist") + logger.debug("Using custom action distribution {}".format(action_dist_name)) + dist_cls = _global_registry.get(RLLIB_ACTION_DIST, action_dist_name) + return ModelCatalog._get_multi_action_distribution( + dist_cls, action_space, custom_action_config, framework + ) + + # Dist_type is given directly as a class. + elif ( + type(dist_type) is type + and issubclass(dist_type, ActionDistribution) + and dist_type not in (MultiActionDistribution, TorchMultiActionDistribution) + ): + dist_cls = dist_type + # Box space -> DiagGaussian OR Deterministic. + elif isinstance(action_space, Box): + if action_space.dtype.name.startswith("int"): + low_ = np.min(action_space.low) + high_ = np.max(action_space.high) + dist_cls = ( + TorchMultiCategorical if framework == "torch" else MultiCategorical + ) + num_cats = int(np.prod(action_space.shape)) + return ( + partial( + dist_cls, + input_lens=[high_ - low_ + 1 for _ in range(num_cats)], + action_space=action_space, + ), + num_cats * (high_ - low_ + 1), + ) + else: + if len(action_space.shape) > 1: + raise UnsupportedSpaceException( + "Action space has multiple dimensions " + "{}. ".format(action_space.shape) + + "Consider reshaping this into a single dimension, " + "using a custom action distribution, " + "using a Tuple action space, or the multi-agent API." + ) + # TODO(sven): Check for bounds and return SquashedNormal, etc.. 
+ if dist_type is None: + return ( + partial( + TorchDiagGaussian if framework == "torch" else DiagGaussian, + action_space=action_space, + ), + DiagGaussian.required_model_output_shape(action_space, config), + ) + elif dist_type == "deterministic": + dist_cls = ( + TorchDeterministic if framework == "torch" else Deterministic + ) + # Discrete Space -> Categorical. + elif isinstance(action_space, Discrete): + if framework == "torch": + dist_cls = TorchCategorical + elif framework == "jax": + from ray.rllib.models.jax.jax_action_dist import JAXCategorical + + dist_cls = JAXCategorical + else: + dist_cls = Categorical + # Tuple/Dict Spaces -> MultiAction. + elif dist_type in ( + MultiActionDistribution, + TorchMultiActionDistribution, + ) or isinstance(action_space, (Tuple, Dict)): + return ModelCatalog._get_multi_action_distribution( + ( + MultiActionDistribution + if framework == "tf" + else TorchMultiActionDistribution + ), + action_space, + config, + framework, + ) + # Simplex -> Dirichlet. + elif isinstance(action_space, Simplex): + dist_cls = TorchDirichlet if framework == "torch" else Dirichlet + # MultiDiscrete -> MultiCategorical. + elif isinstance(action_space, MultiDiscrete): + dist_cls = ( + TorchMultiCategorical if framework == "torch" else MultiCategorical + ) + return partial(dist_cls, input_lens=action_space.nvec), int( + sum(action_space.nvec) + ) + # Unknown type -> Error. + else: + raise NotImplementedError( + "Unsupported args: {} {}".format(action_space, dist_type) + ) + + return dist_cls, int(dist_cls.required_model_output_shape(action_space, config)) + + @staticmethod + @DeveloperAPI + def get_action_shape( + action_space: gym.Space, framework: str = "tf" + ) -> (np.dtype, List[int]): + """Returns action tensor dtype and shape for the action space. + + Args: + action_space: Action space of the target gym env. + framework: The framework identifier. One of "tf" or "torch". + + Returns: + (dtype, shape): Dtype and shape of the actions tensor. 
+ """ + dl_lib = torch if framework == "torch" else tf + if isinstance(action_space, Discrete): + return action_space.dtype, (None,) + elif isinstance(action_space, (Box, Simplex)): + if np.issubdtype(action_space.dtype, np.floating): + return dl_lib.float32, (None,) + action_space.shape + elif np.issubdtype(action_space.dtype, np.integer): + return dl_lib.int32, (None,) + action_space.shape + else: + raise ValueError("RLlib doesn't support non int or float box spaces") + elif isinstance(action_space, MultiDiscrete): + return action_space.dtype, (None,) + action_space.shape + elif isinstance(action_space, (Tuple, Dict)): + flat_action_space = flatten_space(action_space) + size = 0 + all_discrete = True + for i in range(len(flat_action_space)): + if isinstance(flat_action_space[i], Discrete): + size += 1 + else: + all_discrete = False + size += np.prod(flat_action_space[i].shape) + size = int(size) + return dl_lib.int32 if all_discrete else dl_lib.float32, (None, size) + else: + raise NotImplementedError( + "Action space {} not supported".format(action_space) + ) + + @staticmethod + @DeveloperAPI + def get_action_placeholder( + action_space: gym.Space, name: str = "action" + ) -> TensorType: + """Returns an action placeholder consistent with the action space + + Args: + action_space: Action space of the target gym env. + name: An optional string to name the placeholder by. + Default: "action". 
+ + Returns: + action_placeholder: A placeholder for the actions + """ + dtype, shape = ModelCatalog.get_action_shape(action_space, framework="tf") + + return tf1.placeholder(dtype, shape=shape, name=name) + + @staticmethod + @DeveloperAPI + def get_model_v2( + obs_space: gym.Space, + action_space: gym.Space, + num_outputs: int, + model_config: ModelConfigDict, + framework: str = "tf", + name: str = "default_model", + model_interface: type = None, + default_model: type = None, + **model_kwargs + ) -> ModelV2: + """Returns a suitable model compatible with given spaces and output. + + Args: + obs_space: Observation space of the target gym env. This + may have an `original_space` attribute that specifies how to + unflatten the tensor into a ragged tensor. + action_space: Action space of the target gym env. + num_outputs: The size of the output vector of the model. + model_config: The "model" sub-config dict + within the Algorithm's config dict. + framework: One of "tf2", "tf", "torch", or "jax". + name: Name (scope) for the model. + model_interface: Interface required for the model + default_model: Override the default class for the model. This + only has an effect when not using a custom model + model_kwargs: Args to pass to the ModelV2 constructor + + Returns: + model (ModelV2): Model to use for the policy. + """ + + # Validate the given config dict. + ModelCatalog._validate_config( + config=model_config, action_space=action_space, framework=framework + ) + + if model_config.get("custom_model"): + # Allow model kwargs to be overridden / augmented by + # custom_model_config. + customized_model_kwargs = dict( + model_kwargs, **model_config.get("custom_model_config", {}) + ) + + if isinstance(model_config["custom_model"], type): + model_cls = model_config["custom_model"] + elif ( + isinstance(model_config["custom_model"], str) + and "." 
in model_config["custom_model"] + ): + return from_config( + cls=model_config["custom_model"], + obs_space=obs_space, + action_space=action_space, + num_outputs=num_outputs, + model_config=customized_model_kwargs, + name=name, + ) + else: + model_cls = _global_registry.get( + RLLIB_MODEL, model_config["custom_model"] + ) + + # Only allow ModelV2 or native keras Models. + if not issubclass(model_cls, ModelV2): + if framework not in ["tf", "tf2"] or not issubclass( + model_cls, tf.keras.Model + ): + raise ValueError( + "`model_cls` must be a ModelV2 sub-class, but is" + " {}!".format(model_cls) + ) + + logger.info("Wrapping {} as {}".format(model_cls, model_interface)) + model_cls = ModelCatalog._wrap_if_needed(model_cls, model_interface) + + if framework in ["tf2", "tf"]: + # Try wrapping custom model with LSTM/attention, if required. + if model_config.get("use_lstm") or model_config.get("use_attention"): + from ray.rllib.models.tf.attention_net import ( + AttentionWrapper, + ) + from ray.rllib.models.tf.recurrent_net import ( + LSTMWrapper, + ) + + wrapped_cls = model_cls + forward = wrapped_cls.forward + model_cls = ModelCatalog._wrap_if_needed( + wrapped_cls, + LSTMWrapper + if model_config.get("use_lstm") + else AttentionWrapper, + ) + model_cls._wrapped_forward = forward + + # Obsolete: Track and warn if vars were created but not + # registered. Only still do this, if users do register their + # variables. If not (which they shouldn't), don't check here. 
                # Obsolete variable-registration check: record every tf
                # variable created while instantiating the model so it can be
                # compared against what the user explicitly registered.
                created = set()

                def track_var_creation(next_creator, **kw):
                    v = next_creator(**kw)
                    # `.ref()` yields a hashable reference to the tf.Variable,
                    # suitable for set membership.
                    created.add(v.ref())
                    return v

                with tf.variable_creator_scope(track_var_creation):
                    if issubclass(model_cls, tf.keras.Model):
                        instance = model_cls(
                            input_space=obs_space,
                            action_space=action_space,
                            num_outputs=num_outputs,
                            name=name,
                            **customized_model_kwargs,
                        )
                    else:
                        # Try calling with kwargs first (custom ModelV2 should
                        # accept these as kwargs, not get them from
                        # config["custom_model_config"] anymore).
                        try:
                            instance = model_cls(
                                obs_space,
                                action_space,
                                num_outputs,
                                model_config,
                                name,
                                **customized_model_kwargs,
                            )
                        except TypeError as e:
                            # Keyword error: Try old way w/o kwargs.
                            # NOTE(review): this sniffs CPython's TypeError
                            # message text — brittle if the interpreter's
                            # wording changes; confirm against all supported
                            # Python versions.
                            if "__init__() got an unexpected " in e.args[0]:
                                instance = model_cls(
                                    obs_space,
                                    action_space,
                                    num_outputs,
                                    model_config,
                                    name,
                                    **model_kwargs,
                                )
                                logger.warning(
                                    "Custom ModelV2 should accept all custom "
                                    "options as **kwargs, instead of expecting"
                                    " them in config['custom_model_config']!"
                                )
                            # Other error -> re-raise.
                            else:
                                raise e

                # User still registered TFModelV2's variables: Check, whether
                # ok.
                registered = []
                if not isinstance(instance, tf.keras.Model):
                    # `var_list` presumably holds explicitly registered
                    # variables (TFModelV2.register_variables) — confirm.
                    registered = set(instance.var_list)
                if len(registered) > 0:
                    # Registration is all-or-nothing: if the user registered
                    # anything, every created variable must be registered.
                    not_registered = set()
                    for var in created:
                        if var not in registered:
                            not_registered.add(var)
                    if not_registered:
                        raise ValueError(
                            "It looks like you are still using "
                            "`{}.register_variables()` to register your "
                            "model's weights. This is no longer required, but "
                            "if you are still calling this method at least "
                            "once, you must make sure to register all created "
                            "variables properly. The missing variables are {},"
                            " and you only registered {}. "
                            "Did you forget to call `register_variables()` on "
                            "some of the variables in question?".format(
                                instance, not_registered, registered
                            )
                        )
            elif framework == "torch":
                # Try wrapping custom model with LSTM/attention, if required.
                if model_config.get("use_lstm") or model_config.get("use_attention"):
                    from ray.rllib.models.torch.attention_net import AttentionWrapper
                    from ray.rllib.models.torch.recurrent_net import LSTMWrapper

                    wrapped_cls = model_cls
                    forward = wrapped_cls.forward
                    model_cls = ModelCatalog._wrap_if_needed(
                        wrapped_cls,
                        LSTMWrapper
                        if model_config.get("use_lstm")
                        else AttentionWrapper,
                    )
                    # Keep a handle to the original forward so the wrapper can
                    # delegate to it.
                    model_cls._wrapped_forward = forward

                # PyTorch automatically tracks nn.Modules inside the parent
                # nn.Module's constructor.
                # Try calling with kwargs first (custom ModelV2 should
                # accept these as kwargs, not get them from
                # config["custom_model_config"] anymore).
                try:
                    instance = model_cls(
                        obs_space,
                        action_space,
                        num_outputs,
                        model_config,
                        name,
                        **customized_model_kwargs,
                    )
                except TypeError as e:
                    # Keyword error: Try old way w/o kwargs (same brittle
                    # message check as the tf branch above).
                    if "__init__() got an unexpected " in e.args[0]:
                        instance = model_cls(
                            obs_space,
                            action_space,
                            num_outputs,
                            model_config,
                            name,
                            **model_kwargs,
                        )
                        logger.warning(
                            "Custom ModelV2 should accept all custom "
                            "options as **kwargs, instead of expecting"
                            " them in config['custom_model_config']!"
                        )
                    # Other error -> re-raise.
                    else:
                        raise e
            else:
                raise NotImplementedError(
                    "`framework` must be 'tf2|tf|torch', but is "
                    "{}!".format(framework)
                )

            # Custom-model path ends here: hand back the built instance.
            return instance

        # Find a default TFModelV2 and wrap with model_interface.
        if framework in ["tf", "tf2"]:
            v2_class = None
            # Try to get a default v2 model.
+ if not model_config.get("custom_model"): + v2_class = default_model or ModelCatalog._get_v2_model_class( + obs_space, model_config, framework=framework + ) + + if not v2_class: + raise ValueError("ModelV2 class could not be determined!") + + if model_config.get("use_lstm") or model_config.get("use_attention"): + from ray.rllib.models.tf.attention_net import ( + AttentionWrapper, + ) + from ray.rllib.models.tf.recurrent_net import ( + LSTMWrapper, + ) + + wrapped_cls = v2_class + if model_config.get("use_lstm"): + v2_class = ModelCatalog._wrap_if_needed(wrapped_cls, LSTMWrapper) + v2_class._wrapped_forward = wrapped_cls.forward + else: + v2_class = ModelCatalog._wrap_if_needed( + wrapped_cls, AttentionWrapper + ) + v2_class._wrapped_forward = wrapped_cls.forward + + # Wrap in the requested interface. + wrapper = ModelCatalog._wrap_if_needed(v2_class, model_interface) + + if issubclass(wrapper, tf.keras.Model): + model = wrapper( + input_space=obs_space, + action_space=action_space, + num_outputs=num_outputs, + name=name, + **dict(model_kwargs, **model_config), + ) + return model + + return wrapper( + obs_space, action_space, num_outputs, model_config, name, **model_kwargs + ) + + # Find a default TorchModelV2 and wrap with model_interface. + elif framework == "torch": + # Try to get a default v2 model. 
+ if not model_config.get("custom_model"): + v2_class = default_model or ModelCatalog._get_v2_model_class( + obs_space, model_config, framework=framework + ) + + if not v2_class: + raise ValueError("ModelV2 class could not be determined!") + + if model_config.get("use_lstm") or model_config.get("use_attention"): + from ray.rllib.models.torch.attention_net import AttentionWrapper + from ray.rllib.models.torch.recurrent_net import LSTMWrapper + + wrapped_cls = v2_class + forward = wrapped_cls.forward + if model_config.get("use_lstm"): + v2_class = ModelCatalog._wrap_if_needed(wrapped_cls, LSTMWrapper) + else: + v2_class = ModelCatalog._wrap_if_needed( + wrapped_cls, AttentionWrapper + ) + + v2_class._wrapped_forward = forward + + # Wrap in the requested interface. + wrapper = ModelCatalog._wrap_if_needed(v2_class, model_interface) + return wrapper( + obs_space, action_space, num_outputs, model_config, name, **model_kwargs + ) + + # Find a default JAXModelV2 and wrap with model_interface. + elif framework == "jax": + v2_class = default_model or ModelCatalog._get_v2_model_class( + obs_space, model_config, framework=framework + ) + # Wrap in the requested interface. + wrapper = ModelCatalog._wrap_if_needed(v2_class, model_interface) + return wrapper( + obs_space, action_space, num_outputs, model_config, name, **model_kwargs + ) + else: + raise NotImplementedError( + "`framework` must be 'tf2|tf|torch', but is " "{}!".format(framework) + ) + + @staticmethod + @DeveloperAPI + def get_preprocessor( + env: gym.Env, options: Optional[dict] = None, include_multi_binary: bool = False + ) -> Preprocessor: + """Returns a suitable preprocessor for the given env. + + This is a wrapper for get_preprocessor_for_space(). 
+ """ + + return ModelCatalog.get_preprocessor_for_space( + env.observation_space, options, include_multi_binary + ) + + @staticmethod + @DeveloperAPI + def get_preprocessor_for_space( + observation_space: gym.Space, + options: dict = None, + include_multi_binary: bool = False, + ) -> Preprocessor: + """Returns a suitable preprocessor for the given observation space. + + Args: + observation_space: The input observation space. + options: Options to pass to the preprocessor. + include_multi_binary: Whether to include the MultiBinaryPreprocessor in + the possible preprocessors returned by this method. + + Returns: + preprocessor: Preprocessor for the observations. + """ + + options = options or MODEL_DEFAULTS + for k in options.keys(): + if k not in MODEL_DEFAULTS: + raise Exception( + "Unknown config key `{}`, all keys: {}".format( + k, list(MODEL_DEFAULTS) + ) + ) + + cls = get_preprocessor( + observation_space, include_multi_binary=include_multi_binary + ) + prep = cls(observation_space, options) + + if prep is not None: + logger.debug( + "Created preprocessor {}: {} -> {}".format( + prep, observation_space, prep.shape + ) + ) + return prep + + @staticmethod + @PublicAPI + def register_custom_model(model_name: str, model_class: type) -> None: + """Register a custom model class by name. + + The model can be later used by specifying {"custom_model": model_name} + in the model config. + + Args: + model_name: Name to register the model under. + model_class: Python class of the model. + """ + if tf is not None: + if issubclass(model_class, tf.keras.Model): + deprecation_warning(old="register_custom_model", error=False) + _global_registry.register(RLLIB_MODEL, model_name, model_class) + + @staticmethod + @PublicAPI + def register_custom_action_dist( + action_dist_name: str, action_dist_class: type + ) -> None: + """Register a custom action distribution class by name. 
+ + The model can be later used by specifying + {"custom_action_dist": action_dist_name} in the model config. + + Args: + model_name: Name to register the action distribution under. + model_class: Python class of the action distribution. + """ + _global_registry.register( + RLLIB_ACTION_DIST, action_dist_name, action_dist_class + ) + + @staticmethod + def _wrap_if_needed(model_cls: type, model_interface: type) -> type: + if not model_interface or issubclass(model_cls, model_interface): + return model_cls + + assert issubclass(model_cls, ModelV2), model_cls + + class wrapper(model_interface, model_cls): + pass + + name = "{}_as_{}".format(model_cls.__name__, model_interface.__name__) + wrapper.__name__ = name + wrapper.__qualname__ = name + + return wrapper + + @staticmethod + def _get_v2_model_class( + input_space: gym.Space, model_config: ModelConfigDict, framework: str = "tf" + ) -> Type[ModelV2]: + VisionNet = None + ComplexNet = None + + if framework in ["tf2", "tf"]: + from ray.rllib.models.tf.fcnet import ( + FullyConnectedNetwork as FCNet, + ) + from ray.rllib.models.tf.visionnet import ( + VisionNetwork as VisionNet, + ) + from ray.rllib.models.tf.complex_input_net import ( + ComplexInputNetwork as ComplexNet, + ) + elif framework == "torch": + from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as FCNet + from ray.rllib.models.torch.visionnet import VisionNetwork as VisionNet + from ray.rllib.models.torch.complex_input_net import ( + ComplexInputNetwork as ComplexNet, + ) + elif framework == "jax": + from ray.rllib.models.jax.fcnet import FullyConnectedNetwork as FCNet + else: + raise ValueError( + "framework={} not supported in `ModelCatalog._get_v2_model_" + "class`!".format(framework) + ) + + orig_space = ( + input_space + if not hasattr(input_space, "original_space") + else input_space.original_space + ) + + # `input_space` is 3D Box -> VisionNet. 
+ if isinstance(input_space, Box) and len(input_space.shape) == 3: + if framework == "jax": + raise NotImplementedError("No non-FC default net for JAX yet!") + return VisionNet + # `input_space` is 1D Box -> FCNet. + elif ( + isinstance(input_space, Box) + and len(input_space.shape) == 1 + and ( + not isinstance(orig_space, (Dict, Tuple)) + or not any( + isinstance(s, Box) and len(s.shape) >= 2 + for s in flatten_space(orig_space) + ) + ) + ): + return FCNet + # Complex (Dict, Tuple, 2D Box (flatten), Discrete, MultiDiscrete). + else: + if framework == "jax": + raise NotImplementedError("No non-FC default net for JAX yet!") + return ComplexNet + + @staticmethod + def _get_multi_action_distribution(dist_class, action_space, config, framework): + # In case the custom distribution is a child of MultiActionDistr. + # If users want to completely ignore the suggested child + # distributions, they should simply do so in their custom class' + # constructor. + if issubclass( + dist_class, (MultiActionDistribution, TorchMultiActionDistribution) + ): + flat_action_space = flatten_space(action_space) + child_dists_and_in_lens = tree.map_structure( + lambda s: ModelCatalog.get_action_dist(s, config, framework=framework), + flat_action_space, + ) + child_dists = [e[0] for e in child_dists_and_in_lens] + input_lens = [int(e[1]) for e in child_dists_and_in_lens] + return ( + partial( + dist_class, + action_space=action_space, + child_distributions=child_dists, + input_lens=input_lens, + ), + int(sum(input_lens)), + ) + return dist_class, dist_class.required_model_output_shape(action_space, config) + + @staticmethod + def _validate_config( + config: ModelConfigDict, action_space: gym.spaces.Space, framework: str + ) -> None: + """Validates a given model config dict. + + Args: + config: The "model" sub-config dict + within the Algorithm's config dict. + action_space: The action space of the model, whose config are + validated. + framework: One of "jax", "tf2", "tf", or "torch". 
+ + Raises: + ValueError: If something is wrong with the given config. + """ + # Soft-deprecate custom preprocessors. + if config.get("custom_preprocessor") is not None: + deprecation_warning( + old="model.custom_preprocessor", + new="gym.ObservationWrapper around your env or handle complex " + "inputs inside your Model", + error=True, + ) + + if config.get("use_attention") and config.get("use_lstm"): + raise ValueError( + "Only one of `use_lstm` or `use_attention` may be set to True!" + ) + + # For complex action spaces, only allow prev action inputs to + # LSTMs and attention nets iff `_disable_action_flattening=True`. + # TODO: `_disable_action_flattening=True` will be the default in + # the future. + if ( + ( + config.get("lstm_use_prev_action") + or config.get("attention_use_n_prev_actions", 0) > 0 + ) + and not config.get("_disable_action_flattening") + and isinstance(action_space, (Tuple, Dict)) + ): + raise ValueError( + "For your complex action space (Tuple|Dict) and your model's " + "`prev-actions` setup of your model, you must set " + "`_disable_action_flattening=True` in your main config dict!" + ) + + if framework == "jax": + if config.get("use_attention"): + raise ValueError( + "`use_attention` not available for framework=jax so far!" 
+ ) + elif config.get("use_lstm"): + raise ValueError("`use_lstm` not available for framework=jax so far!") diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/distributions.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/distributions.py new file mode 100644 index 0000000000000000000000000000000000000000..bda55acd27702f940fac46ad7835f4b91c112f53 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/distributions.py @@ -0,0 +1,248 @@ +"""This is the next version of action distribution base class.""" +from typing import Tuple +import gymnasium as gym +import abc + +from ray.rllib.utils.annotations import ExperimentalAPI +from ray.rllib.utils.typing import TensorType, Union +from ray.rllib.utils.annotations import override + + +@ExperimentalAPI +class Distribution(abc.ABC): + """The base class for distribution over a random variable. + + Examples: + + .. testcode:: + + import torch + from ray.rllib.core.models.configs import MLPHeadConfig + from ray.rllib.models.torch.torch_distributions import TorchCategorical + + model = MLPHeadConfig(input_dims=[1]).build(framework="torch") + + # Create an action distribution from model logits + action_logits = model(torch.Tensor([[1]])) + action_dist = TorchCategorical.from_logits(action_logits) + action = action_dist.sample() + + # Create another distribution from a dummy Tensor + action_dist2 = TorchCategorical.from_logits(torch.Tensor([0])) + + # Compute some common metrics + logp = action_dist.logp(action) + kl = action_dist.kl(action_dist2) + entropy = action_dist.entropy() + """ + + @abc.abstractmethod + def sample( + self, + *, + sample_shape: Tuple[int, ...] = None, + return_logp: bool = False, + **kwargs, + ) -> Union[TensorType, Tuple[TensorType, TensorType]]: + """Draw a sample from the distribution. + + Args: + sample_shape: The shape of the sample to draw. + return_logp: Whether to return the logp of the sampled values. + **kwargs: Forward compatibility placeholder. 
+ + Returns: + The sampled values. If return_logp is True, returns a tuple of the + sampled values and its logp. + """ + + @abc.abstractmethod + def rsample( + self, + *, + sample_shape: Tuple[int, ...] = None, + return_logp: bool = False, + **kwargs, + ) -> Union[TensorType, Tuple[TensorType, TensorType]]: + """Draw a re-parameterized sample from the action distribution. + + If this method is implemented, we can take gradients of samples w.r.t. the + distribution parameters. + + Args: + sample_shape: The shape of the sample to draw. + return_logp: Whether to return the logp of the sampled values. + **kwargs: Forward compatibility placeholder. + + Returns: + The sampled values. If return_logp is True, returns a tuple of the + sampled values and its logp. + """ + + @abc.abstractmethod + def logp(self, value: TensorType, **kwargs) -> TensorType: + """The log-likelihood of the distribution computed at `value` + + Args: + value: The value to compute the log-likelihood at. + **kwargs: Forward compatibility placeholder. + + Returns: + The log-likelihood of the value. + """ + + @abc.abstractmethod + def kl(self, other: "Distribution", **kwargs) -> TensorType: + """The KL-divergence between two distributions. + + Args: + other: The other distribution. + **kwargs: Forward compatibility placeholder. + + Returns: + The KL-divergence between the two distributions. + """ + + @abc.abstractmethod + def entropy(self, **kwargs) -> TensorType: + """The entropy of the distribution. + + Args: + **kwargs: Forward compatibility placeholder. + + Returns: + The entropy of the distribution. + """ + + @staticmethod + @abc.abstractmethod + def required_input_dim(space: gym.Space, **kwargs) -> int: + """Returns the required length of an input parameter tensor. + + Args: + space: The space this distribution will be used for, + whose shape attributes will be used to determine the required shape of + the input parameter tensor. + **kwargs: Forward compatibility placeholder. 

        Returns:
            size of the required input vector (minus leading batch dimension).
        """

    @classmethod
    def from_logits(cls, logits: TensorType, **kwargs) -> "Distribution":
        """Creates a Distribution from logits.

        The caller does not need to have knowledge of the distribution class in order
        to create it and sample from it. The passed batched logits vectors might be
        split up and are passed to the distribution class' constructor as kwargs.

        Args:
            logits: The logits to create the distribution from.
            **kwargs: Forward compatibility placeholder.

        Returns:
            The created distribution.

        .. testcode::

            import numpy as np
            from ray.rllib.models.distributions import Distribution

            class Uniform(Distribution):
                def __init__(self, lower, upper):
                    self.lower = lower
                    self.upper = upper

                def sample(self):
                    return self.lower + (self.upper - self.lower) * np.random.rand()

                def logp(self, x):
                    ...

                def kl(self, other):
                    ...

                def entropy(self):
                    ...

                @staticmethod
                def required_input_dim(space):
                    ...

                def rsample(self):
                    ...

                @classmethod
                def from_logits(cls, logits, **kwargs):
                    return Uniform(logits[:, 0], logits[:, 1])

            logits = np.array([[0.0, 1.0], [2.0, 3.0]])
            my_dist = Uniform.from_logits(logits)
            sample = my_dist.sample()
        """
        raise NotImplementedError

    @classmethod
    def get_partial_dist_cls(
        parent_cls: "Distribution", **partial_kwargs
    ) -> "Distribution":
        """Returns a partial child of TorchMultiActionDistribution.

        This is useful if inputs needed to instantiate the Distribution from logits
        are available, but the logits are not.
        """
        # NOTE(review): since this is a @classmethod, `parent_cls` is the class
        # this is invoked on (conventionally named `cls`); upstream naming kept.

        # Subclass that captures `partial_kwargs` via closure and re-injects
        # them once logits become available.
        class DistributionPartial(parent_cls):
            def __init__(self, *args, **kwargs):
                super().__init__(*args, **kwargs)

            @staticmethod
            def _merge_kwargs(**kwargs):
                """Checks if keys in kwargs don't clash with partial_kwargs."""
                overlap = set(kwargs) & set(partial_kwargs)
                if overlap:
                    raise ValueError(
                        f"Cannot override the following kwargs: {overlap}.\n"
                        f"This is because they were already set at the time this "
                        f"partial class was defined."
                    )
                # Call-time kwargs are merged after the captured ones; the
                # overlap check above guarantees no silent override.
                merged_kwargs = {**partial_kwargs, **kwargs}
                return merged_kwargs

            @classmethod
            @override(parent_cls)
            def required_input_dim(cls, space: gym.Space, **kwargs) -> int:
                merged_kwargs = cls._merge_kwargs(**kwargs)
                # Sanity check: the captured kwargs must contain the same
                # `space` the caller asks about (assumes a "space" key was
                # provided at partial-creation time — confirm with callers).
                assert space == merged_kwargs["space"]
                return parent_cls.required_input_dim(**merged_kwargs)

            @classmethod
            @override(parent_cls)
            def from_logits(
                cls,
                logits: TensorType,
                **kwargs,
            ) -> "DistributionPartial":
                merged_kwargs = cls._merge_kwargs(**kwargs)
                distribution = parent_cls.from_logits(logits, **merged_kwargs)
                # Replace the class of the returned distribution with this partial
                # This makes it so that we can use type() on this distribution and
                # get back the partial class.
                distribution.__class__ = cls
                return distribution

        # Substitute name of this partial class to match the original class.
        # NOTE(review): the f-string formats the class object itself, so
        # __name__ becomes e.g. "<class 'pkg.Foo'>Partial" rather than
        # "FooPartial"; `parent_cls.__name__` was presumably intended —
        # upstream behavior preserved here.
        DistributionPartial.__name__ = f"{parent_cls}Partial"

        return DistributionPartial

    def to_deterministic(self) -> "Distribution":
        """Returns a deterministic equivalent for this distribution.

        Specifically, the deterministic equivalent for a Categorical distribution is a
        Deterministic distribution that selects the action with maximum logit value.
        Generally, the choice of the deterministic replacement is informed by
        established conventions.
+ """ + return self diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/modelv2.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/modelv2.py new file mode 100644 index 0000000000000000000000000000000000000000..df07150e57bac2ead3c235f0abfe23f696921b5f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/modelv2.py @@ -0,0 +1,471 @@ +from collections import OrderedDict +import contextlib +import gymnasium as gym +from gymnasium.spaces import Space +import numpy as np +from typing import Dict, List, Any, Union + +from ray.rllib.models.preprocessors import get_preprocessor, RepeatedValuesPreprocessor +from ray.rllib.models.repeated_values import RepeatedValues +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.policy.view_requirement import ViewRequirement +from ray.rllib.utils import NullContextManager +from ray.rllib.utils.annotations import OldAPIStack +from ray.rllib.utils.deprecation import Deprecated +from ray.rllib.utils.framework import try_import_tf, try_import_torch, TensorType +from ray.rllib.utils.spaces.repeated import Repeated +from ray.rllib.utils.typing import ModelConfigDict, ModelInputDict, TensorStructType + +tf1, tf, tfv = try_import_tf() +torch, _ = try_import_torch() + + +@OldAPIStack +class ModelV2: + r"""Defines an abstract neural network model for use with RLlib. + + Custom models should extend either TFModelV2 or TorchModelV2 instead of + this class directly. + + Data flow: + obs -> forward() -> model_out + \-> value_function() -> V(s) + """ + + def __init__( + self, + obs_space: Space, + action_space: Space, + num_outputs: int, + model_config: ModelConfigDict, + name: str, + framework: str, + ): + """Initializes a ModelV2 instance. + + This method should create any variables used by the model. + + Args: + obs_space: Observation space of the target gym + env. This may have an `original_space` attribute that + specifies how to unflatten the tensor into a ragged tensor. 
+ action_space: Action space of the target gym + env. + num_outputs: Number of output units of the model. + model_config: Config for the model, documented + in ModelCatalog. + name: Name (scope) for the model. + framework: Either "tf" or "torch". + """ + + self.obs_space: Space = obs_space + self.action_space: Space = action_space + self.num_outputs: int = num_outputs + self.model_config: ModelConfigDict = model_config + self.name: str = name or "default_model" + self.framework: str = framework + self._last_output = None + self.time_major = self.model_config.get("_time_major") + # Basic view requirement for all models: Use the observation as input. + self.view_requirements = { + SampleBatch.OBS: ViewRequirement(shift=0, space=self.obs_space), + } + + def get_initial_state(self) -> List[TensorType]: + """Get the initial recurrent state values for the model. + + Returns: + List of np.array (for tf) or Tensor (for torch) objects containing the + initial hidden state of an RNN, if applicable. + + .. testcode:: + :skipif: True + + import numpy as np + from ray.rllib.models.modelv2 import ModelV2 + class MyModel(ModelV2): + # ... + def get_initial_state(self): + return [ + np.zeros(self.cell_size, np.float32), + np.zeros(self.cell_size, np.float32), + ] + """ + return [] + + def forward( + self, + input_dict: Dict[str, TensorType], + state: List[TensorType], + seq_lens: TensorType, + ) -> (TensorType, List[TensorType]): + """Call the model with the given input tensors and state. + + Any complex observations (dicts, tuples, etc.) will be unpacked by + __call__ before being passed to forward(). To access the flattened + observation tensor, refer to input_dict["obs_flat"]. + + This method can be called any number of times. In eager execution, + each call to forward() will eagerly evaluate the model. In symbolic + execution, each call to forward creates a computation graph that + operates over the variables of this model (i.e., shares weights). 
+ + Custom models should override this instead of __call__. + + Args: + input_dict: dictionary of input tensors, including "obs", + "obs_flat", "prev_action", "prev_reward", "is_training", + "eps_id", "agent_id", "infos", and "t". + state: list of state tensors with sizes matching those + returned by get_initial_state + the batch dimension + seq_lens: 1d tensor holding input sequence lengths + + Returns: + A tuple consisting of the model output tensor of size + [BATCH, num_outputs] and the list of new RNN state(s) if any. + + .. testcode:: + :skipif: True + + import numpy as np + from ray.rllib.models.modelv2 import ModelV2 + class MyModel(ModelV2): + # ... + def forward(self, input_dict, state, seq_lens): + model_out, self._value_out = self.base_model( + input_dict["obs"]) + return model_out, state + """ + raise NotImplementedError + + def value_function(self) -> TensorType: + """Returns the value function output for the most recent forward pass. + + Note that a `forward` call has to be performed first, before this + methods can return anything and thus that calling this method does not + cause an extra forward pass through the network. + + Returns: + Value estimate tensor of shape [BATCH]. + """ + raise NotImplementedError + + def custom_loss( + self, policy_loss: TensorType, loss_inputs: Dict[str, TensorType] + ) -> Union[List[TensorType], TensorType]: + """Override to customize the loss function used to optimize this model. + + This can be used to incorporate self-supervised losses (by defining + a loss over existing input and output tensors of this model), and + supervised losses (by defining losses over a variable-sharing copy of + this model's layers). + + You can find an runnable example in examples/custom_loss.py. + + Args: + policy_loss: List of or single policy loss(es) from the policy. + loss_inputs: map of input placeholders for rollout data. + + Returns: + List of or scalar tensor for the customized loss(es) for this + model. 
+ """ + return policy_loss + + def metrics(self) -> Dict[str, TensorType]: + """Override to return custom metrics from your model. + + The stats will be reported as part of the learner stats, i.e., + info.learner.[policy_id, e.g. "default_policy"].model.key1=metric1 + + Returns: + The custom metrics for this model. + """ + return {} + + def __call__( + self, + input_dict: Union[SampleBatch, ModelInputDict], + state: List[Any] = None, + seq_lens: TensorType = None, + ) -> (TensorType, List[TensorType]): + """Call the model with the given input tensors and state. + + This is the method used by RLlib to execute the forward pass. It calls + forward() internally after unpacking nested observation tensors. + + Custom models should override forward() instead of __call__. + + Args: + input_dict: Dictionary of input tensors. + state: list of state tensors with sizes matching those + returned by get_initial_state + the batch dimension + seq_lens: 1D tensor holding input sequence lengths. + + Returns: + A tuple consisting of the model output tensor of size + [BATCH, output_spec.size] or a list of tensors corresponding to + output_spec.shape_list, and a list of state tensors of + [BATCH, state_size_i] if any. + """ + + # Original observations will be stored in "obs". + # Flattened (preprocessed) obs will be stored in "obs_flat". + + # SampleBatch case: Models can now be called directly with a + # SampleBatch (which also includes tracking-dict case (deprecated now), + # where tensors get automatically converted). + if isinstance(input_dict, SampleBatch): + restored = input_dict.copy(shallow=True) + else: + restored = input_dict.copy() + + # Backward compatibility. + if not state: + state = [] + i = 0 + while "state_in_{}".format(i) in input_dict: + state.append(input_dict["state_in_{}".format(i)]) + i += 1 + if seq_lens is None: + seq_lens = input_dict.get(SampleBatch.SEQ_LENS) + + # No Preprocessor used: `config._disable_preprocessor_api`=True. 
+ # TODO: This is unnecessary for when no preprocessor is used. + # Obs are not flat then anymore. However, we'll keep this + # here for backward-compatibility until Preprocessors have + # been fully deprecated. + if self.model_config.get("_disable_preprocessor_api"): + restored["obs_flat"] = input_dict["obs"] + # Input to this Model went through a Preprocessor. + # Generate extra keys: "obs_flat" (vs "obs", which will hold the + # original obs). + else: + restored["obs"] = restore_original_dimensions( + input_dict["obs"], self.obs_space, self.framework + ) + try: + if len(input_dict["obs"].shape) > 2: + restored["obs_flat"] = flatten(input_dict["obs"], self.framework) + else: + restored["obs_flat"] = input_dict["obs"] + except AttributeError: + restored["obs_flat"] = input_dict["obs"] + + with self.context(): + res = self.forward(restored, state or [], seq_lens) + + if isinstance(input_dict, SampleBatch): + input_dict.accessed_keys = restored.accessed_keys - {"obs_flat"} + input_dict.deleted_keys = restored.deleted_keys + input_dict.added_keys = restored.added_keys - {"obs_flat"} + + if (not isinstance(res, list) and not isinstance(res, tuple)) or len(res) != 2: + raise ValueError( + "forward() must return a tuple of (output, state) tensors, " + "got {}".format(res) + ) + outputs, state_out = res + + if not isinstance(state_out, list): + raise ValueError("State output is not a list: {}".format(state_out)) + + self._last_output = outputs + return outputs, state_out if len(state_out) > 0 else (state or []) + + def last_output(self) -> TensorType: + """Returns the last output returned from calling the model.""" + return self._last_output + + def context(self) -> contextlib.AbstractContextManager: + """Returns a contextmanager for the current forward pass.""" + return NullContextManager() + + def variables( + self, as_dict: bool = False + ) -> Union[List[TensorType], Dict[str, TensorType]]: + """Returns the list (or a dict) of variables for this model. 
+ + Args: + as_dict: Whether variables should be returned as dict-values + (using descriptive str keys). + + Returns: + The list (or dict if `as_dict` is True) of all variables of this + ModelV2. + """ + raise NotImplementedError + + def trainable_variables( + self, as_dict: bool = False + ) -> Union[List[TensorType], Dict[str, TensorType]]: + """Returns the list of trainable variables for this model. + + Args: + as_dict: Whether variables should be returned as dict-values + (using descriptive keys). + + Returns: + The list (or dict if `as_dict` is True) of all trainable + (tf)/requires_grad (torch) variables of this ModelV2. + """ + raise NotImplementedError + + def is_time_major(self) -> bool: + """If True, data for calling this ModelV2 must be in time-major format. + + Returns + Whether this ModelV2 requires a time-major (TxBx...) data + format. + """ + return self.time_major is True + + @Deprecated(error=True) + def import_from_h5(self, *args, **kwargs): + pass + + +@OldAPIStack +def flatten(obs: TensorType, framework: str) -> TensorType: + """Flatten the given tensor.""" + if framework in ["tf2", "tf"]: + return tf1.keras.layers.Flatten()(obs) + elif framework == "torch": + assert torch is not None + return torch.flatten(obs, start_dim=1) + else: + raise NotImplementedError("flatten", framework) + + +@OldAPIStack +def restore_original_dimensions( + obs: TensorType, obs_space: Space, tensorlib: Any = tf +) -> TensorStructType: + """Unpacks Dict and Tuple space observations into their original form. + + This is needed since we flatten Dict and Tuple observations in transit + within a SampleBatch. Before sending them to the model though, we should + unflatten them into Dicts or Tuples of tensors. + + Args: + obs: The flattened observation tensor. + obs_space: The flattened obs space. If this has the + `original_space` attribute, we will unflatten the tensor to that + shape. + tensorlib: The library used to unflatten (reshape) the array/tensor. 
+ + Returns: + single tensor or dict / tuple of tensors matching the original + observation space. + """ + + if tensorlib in ["tf", "tf2"]: + assert tf is not None + tensorlib = tf + elif tensorlib == "torch": + assert torch is not None + tensorlib = torch + elif tensorlib == "numpy": + assert np is not None + tensorlib = np + original_space = getattr(obs_space, "original_space", obs_space) + return _unpack_obs(obs, original_space, tensorlib=tensorlib) + + +# Cache of preprocessors, for if the user is calling unpack obs often. +_cache = {} + + +@OldAPIStack +def _unpack_obs(obs: TensorType, space: Space, tensorlib: Any = tf) -> TensorStructType: + """Unpack a flattened Dict or Tuple observation array/tensor. + + Args: + obs: The flattened observation tensor, with last dimension equal to + the flat size and any number of batch dimensions. For example, for + Box(4,), the obs may have shape [B, 4], or [B, N, M, 4] in case + the Box was nested under two Repeated spaces. + space: The original space prior to flattening + tensorlib: The library used to unflatten (reshape) the array/tensor + """ + + if isinstance(space, (gym.spaces.Dict, gym.spaces.Tuple, Repeated)): + # Already unpacked? + if (isinstance(space, gym.spaces.Tuple) and isinstance(obs, (list, tuple))) or ( + isinstance(space, gym.spaces.Dict) and isinstance(obs, dict) + ): + return obs + # Unpack using preprocessor + if id(space) in _cache: + prep = _cache[id(space)] + else: + prep = get_preprocessor(space)(space) + # Make an attempt to cache the result, if enough space left. 
+ if len(_cache) < 999: + _cache[id(space)] = prep + if len(obs.shape) < 2 or obs.shape[-1] != prep.shape[0]: + raise ValueError( + "Expected flattened obs shape of [..., {}], got {}".format( + prep.shape[0], obs.shape + ) + ) + offset = 0 + if tensorlib == tf: + + def get_value(v): + if v is None: + return -1 + elif isinstance(v, int): + return v + elif v.value is None: + return -1 + else: + return v.value + + batch_dims = [get_value(v) for v in obs.shape[:-1]] + else: + batch_dims = list(obs.shape[:-1]) + if isinstance(space, gym.spaces.Tuple): + assert len(prep.preprocessors) == len(space.spaces), len( + prep.preprocessors + ) == len(space.spaces) + u = [] + for p, v in zip(prep.preprocessors, space.spaces): + obs_slice = obs[..., offset : offset + p.size] + offset += p.size + u.append( + _unpack_obs( + tensorlib.reshape(obs_slice, batch_dims + list(p.shape)), + v, + tensorlib=tensorlib, + ) + ) + elif isinstance(space, gym.spaces.Dict): + assert len(prep.preprocessors) == len(space.spaces), len( + prep.preprocessors + ) == len(space.spaces) + u = OrderedDict() + for p, (k, v) in zip(prep.preprocessors, space.spaces.items()): + obs_slice = obs[..., offset : offset + p.size] + offset += p.size + u[k] = _unpack_obs( + tensorlib.reshape(obs_slice, batch_dims + list(p.shape)), + v, + tensorlib=tensorlib, + ) + # Repeated space. + else: + assert isinstance(prep, RepeatedValuesPreprocessor), prep + child_size = prep.child_preprocessor.size + # The list lengths are stored in the first slot of the flat obs. + lengths = obs[..., 0] + # [B, ..., 1 + max_len * child_sz] -> [B, ..., max_len, child_sz] + with_repeat_dim = tensorlib.reshape( + obs[..., 1:], batch_dims + [space.max_len, child_size] + ) + # Retry the unpack, dropping the List container space. 
+ u = _unpack_obs(with_repeat_dim, space.child_space, tensorlib=tensorlib) + return RepeatedValues(u, lengths=lengths, max_len=prep._obs_space.max_len) + return u + else: + return obs diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/preprocessors.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/preprocessors.py new file mode 100644 index 0000000000000000000000000000000000000000..ad15d0c155124e38ba7fb8c84b162b7a564d4c9a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/preprocessors.py @@ -0,0 +1,447 @@ +from collections import OrderedDict +import logging +import numpy as np +import gymnasium as gym +from typing import Any, List + +from ray.rllib.utils.annotations import OldAPIStack, override +from ray.rllib.utils.spaces.repeated import Repeated +from ray.rllib.utils.typing import TensorType +from ray.rllib.utils.images import resize +from ray.rllib.utils.spaces.space_utils import convert_element_to_space_type + +ATARI_OBS_SHAPE = (210, 160, 3) +ATARI_RAM_OBS_SHAPE = (128,) + +# Only validate env observations vs the observation space every n times in a +# Preprocessor. +OBS_VALIDATION_INTERVAL = 100 + +logger = logging.getLogger(__name__) + + +@OldAPIStack +class Preprocessor: + """Defines an abstract observation preprocessor function. + + Attributes: + shape (List[int]): Shape of the preprocessed output. 
+ """ + + def __init__(self, obs_space: gym.Space, options: dict = None): + _legacy_patch_shapes(obs_space) + self._obs_space = obs_space + if not options: + from ray.rllib.models.catalog import MODEL_DEFAULTS + + self._options = MODEL_DEFAULTS.copy() + else: + self._options = options + self.shape = self._init_shape(obs_space, self._options) + self._size = int(np.prod(self.shape)) + self._i = 0 + self._obs_for_type_matching = self._obs_space.sample() + + def _init_shape(self, obs_space: gym.Space, options: dict) -> List[int]: + """Returns the shape after preprocessing.""" + raise NotImplementedError + + def transform(self, observation: TensorType) -> np.ndarray: + """Returns the preprocessed observation.""" + raise NotImplementedError + + def write(self, observation: TensorType, array: np.ndarray, offset: int) -> None: + """Alternative to transform for more efficient flattening.""" + array[offset : offset + self._size] = self.transform(observation) + + def check_shape(self, observation: Any) -> None: + """Checks the shape of the given observation.""" + if self._i % OBS_VALIDATION_INTERVAL == 0: + # Convert lists to np.ndarrays. 
+ if type(observation) is list and isinstance( + self._obs_space, gym.spaces.Box + ): + observation = np.array(observation).astype(np.float32) + if not self._obs_space.contains(observation): + observation = convert_element_to_space_type( + observation, self._obs_for_type_matching + ) + try: + if not self._obs_space.contains(observation): + raise ValueError( + "Observation ({} dtype={}) outside given space ({})!".format( + observation, + observation.dtype + if isinstance(self._obs_space, gym.spaces.Box) + else None, + self._obs_space, + ) + ) + except AttributeError as e: + raise ValueError( + "Observation for a Box/MultiBinary/MultiDiscrete space " + "should be an np.array, not a Python list.", + observation, + ) from e + self._i += 1 + + @property + def size(self) -> int: + return self._size + + @property + def observation_space(self) -> gym.Space: + obs_space = gym.spaces.Box(-1.0, 1.0, self.shape, dtype=np.float32) + # Stash the unwrapped space so that we can unwrap dict and tuple spaces + # automatically in modelv2.py + classes = ( + DictFlatteningPreprocessor, + OneHotPreprocessor, + RepeatedValuesPreprocessor, + TupleFlatteningPreprocessor, + AtariRamPreprocessor, + GenericPixelPreprocessor, + ) + if isinstance(self, classes): + obs_space.original_space = self._obs_space + return obs_space + + +@OldAPIStack +class GenericPixelPreprocessor(Preprocessor): + """Generic image preprocessor. + + Note: for Atari games, use config {"preprocessor_pref": "deepmind"} + instead for deepmind-style Atari preprocessing. 
+ """ + + @override(Preprocessor) + def _init_shape(self, obs_space: gym.Space, options: dict) -> List[int]: + self._grayscale = options.get("grayscale") + self._zero_mean = options.get("zero_mean") + self._dim = options.get("dim") + if self._grayscale: + shape = (self._dim, self._dim, 1) + else: + shape = (self._dim, self._dim, 3) + + return shape + + @override(Preprocessor) + def transform(self, observation: TensorType) -> np.ndarray: + """Downsamples images from (210, 160, 3) by the configured factor.""" + self.check_shape(observation) + scaled = observation[25:-25, :, :] + if self._dim < 84: + scaled = resize(scaled, height=84, width=84) + # OpenAI: Resize by half, then down to 42x42 (essentially mipmapping). + # If we resize directly we lose pixels that, when mapped to 42x42, + # aren't close enough to the pixel boundary. + scaled = resize(scaled, height=self._dim, width=self._dim) + if self._grayscale: + scaled = scaled.mean(2) + scaled = scaled.astype(np.float32) + # Rescale needed for maintaining 1 channel + scaled = np.reshape(scaled, [self._dim, self._dim, 1]) + if self._zero_mean: + scaled = (scaled - 128) / 128 + else: + scaled *= 1.0 / 255.0 + return scaled + + +@OldAPIStack +class AtariRamPreprocessor(Preprocessor): + @override(Preprocessor) + def _init_shape(self, obs_space: gym.Space, options: dict) -> List[int]: + return (128,) + + @override(Preprocessor) + def transform(self, observation: TensorType) -> np.ndarray: + self.check_shape(observation) + return (observation.astype("float32") - 128) / 128 + + +@OldAPIStack +class OneHotPreprocessor(Preprocessor): + """One-hot preprocessor for Discrete and MultiDiscrete spaces. + + .. testcode:: + :skipif: True + + self.transform(Discrete(3).sample()) + + .. testoutput:: + + np.array([0.0, 1.0, 0.0]) + + .. testcode:: + :skipif: True + + self.transform(MultiDiscrete([2, 3]).sample()) + + .. 
testoutput:: + + np.array([0.0, 1.0, 0.0, 0.0, 1.0]) + """ + + @override(Preprocessor) + def _init_shape(self, obs_space: gym.Space, options: dict) -> List[int]: + if isinstance(obs_space, gym.spaces.Discrete): + return (self._obs_space.n,) + else: + return (np.sum(self._obs_space.nvec),) + + @override(Preprocessor) + def transform(self, observation: TensorType) -> np.ndarray: + self.check_shape(observation) + return gym.spaces.utils.flatten(self._obs_space, observation).astype(np.float32) + + @override(Preprocessor) + def write(self, observation: TensorType, array: np.ndarray, offset: int) -> None: + array[offset : offset + self.size] = self.transform(observation) + + +@OldAPIStack +class NoPreprocessor(Preprocessor): + @override(Preprocessor) + def _init_shape(self, obs_space: gym.Space, options: dict) -> List[int]: + return self._obs_space.shape + + @override(Preprocessor) + def transform(self, observation: TensorType) -> np.ndarray: + self.check_shape(observation) + return observation + + @override(Preprocessor) + def write(self, observation: TensorType, array: np.ndarray, offset: int) -> None: + array[offset : offset + self._size] = np.array(observation, copy=False).ravel() + + @property + @override(Preprocessor) + def observation_space(self) -> gym.Space: + return self._obs_space + + +@OldAPIStack +class MultiBinaryPreprocessor(Preprocessor): + """Preprocessor that turns a MultiBinary space into a Box. + + Note: Before RLModules were introduced, RLlib's ModelCatalogV2 would produce + ComplexInputNetworks that treat MultiBinary spaces as Boxes. This preprocessor is + needed to get rid of the ComplexInputNetworks and use RLModules instead because + RLModules lack the logic to handle MultiBinary or other non-Box spaces. 
+ """ + + @override(Preprocessor) + def _init_shape(self, obs_space: gym.Space, options: dict) -> List[int]: + return self._obs_space.shape + + @override(Preprocessor) + def transform(self, observation: TensorType) -> np.ndarray: + # The shape stays the same, but the dtype changes. + self.check_shape(observation) + return observation.astype(np.float32) + + @override(Preprocessor) + def write(self, observation: TensorType, array: np.ndarray, offset: int) -> None: + array[offset : offset + self._size] = np.array(observation, copy=False).ravel() + + @property + @override(Preprocessor) + def observation_space(self) -> gym.Space: + obs_space = gym.spaces.Box(0.0, 1.0, self.shape, dtype=np.float32) + obs_space.original_space = self._obs_space + return obs_space + + +@OldAPIStack +class TupleFlatteningPreprocessor(Preprocessor): + """Preprocesses each tuple element, then flattens it all into a vector. + + RLlib models will unpack the flattened output before _build_layers_v2(). + """ + + @override(Preprocessor) + def _init_shape(self, obs_space: gym.Space, options: dict) -> List[int]: + assert isinstance(self._obs_space, gym.spaces.Tuple) + size = 0 + self.preprocessors = [] + for i in range(len(self._obs_space.spaces)): + space = self._obs_space.spaces[i] + logger.debug("Creating sub-preprocessor for {}".format(space)) + preprocessor_class = get_preprocessor(space) + if preprocessor_class is not None: + preprocessor = preprocessor_class(space, self._options) + size += preprocessor.size + else: + preprocessor = None + size += int(np.prod(space.shape)) + self.preprocessors.append(preprocessor) + return (size,) + + @override(Preprocessor) + def transform(self, observation: TensorType) -> np.ndarray: + self.check_shape(observation) + array = np.zeros(self.shape, dtype=np.float32) + self.write(observation, array, 0) + return array + + @override(Preprocessor) + def write(self, observation: TensorType, array: np.ndarray, offset: int) -> None: + assert len(observation) == 
len(self.preprocessors), observation + for o, p in zip(observation, self.preprocessors): + p.write(o, array, offset) + offset += p.size + + +@OldAPIStack +class DictFlatteningPreprocessor(Preprocessor): + """Preprocesses each dict value, then flattens it all into a vector. + + RLlib models will unpack the flattened output before _build_layers_v2(). + """ + + @override(Preprocessor) + def _init_shape(self, obs_space: gym.Space, options: dict) -> List[int]: + assert isinstance(self._obs_space, gym.spaces.Dict) + size = 0 + self.preprocessors = [] + for space in self._obs_space.spaces.values(): + logger.debug("Creating sub-preprocessor for {}".format(space)) + preprocessor_class = get_preprocessor(space) + if preprocessor_class is not None: + preprocessor = preprocessor_class(space, self._options) + size += preprocessor.size + else: + preprocessor = None + size += int(np.prod(space.shape)) + self.preprocessors.append(preprocessor) + return (size,) + + @override(Preprocessor) + def transform(self, observation: TensorType) -> np.ndarray: + self.check_shape(observation) + array = np.zeros(self.shape, dtype=np.float32) + self.write(observation, array, 0) + return array + + @override(Preprocessor) + def write(self, observation: TensorType, array: np.ndarray, offset: int) -> None: + if not isinstance(observation, OrderedDict): + observation = OrderedDict(sorted(observation.items())) + assert len(observation) == len(self.preprocessors), ( + len(observation), + len(self.preprocessors), + ) + for o, p in zip(observation.values(), self.preprocessors): + p.write(o, array, offset) + offset += p.size + + +@OldAPIStack +class RepeatedValuesPreprocessor(Preprocessor): + """Pads and batches the variable-length list value.""" + + @override(Preprocessor) + def _init_shape(self, obs_space: gym.Space, options: dict) -> List[int]: + assert isinstance(self._obs_space, Repeated) + child_space = obs_space.child_space + self.child_preprocessor = get_preprocessor(child_space)( + child_space, 
self._options + ) + # The first slot encodes the list length. + size = 1 + self.child_preprocessor.size * obs_space.max_len + return (size,) + + @override(Preprocessor) + def transform(self, observation: TensorType) -> np.ndarray: + array = np.zeros(self.shape) + if isinstance(observation, list): + for elem in observation: + self.child_preprocessor.check_shape(elem) + else: + pass # ValueError will be raised in write() below. + self.write(observation, array, 0) + return array + + @override(Preprocessor) + def write(self, observation: TensorType, array: np.ndarray, offset: int) -> None: + if not isinstance(observation, (list, np.ndarray)): + raise ValueError( + "Input for {} must be list type, got {}".format(self, observation) + ) + elif len(observation) > self._obs_space.max_len: + raise ValueError( + "Input {} exceeds max len of space {}".format( + observation, self._obs_space.max_len + ) + ) + # The first slot encodes the list length. + array[offset] = len(observation) + for i, elem in enumerate(observation): + offset_i = offset + 1 + i * self.child_preprocessor.size + self.child_preprocessor.write(elem, array, offset_i) + + +@OldAPIStack +def get_preprocessor(space: gym.Space, include_multi_binary=False) -> type: + """Returns an appropriate preprocessor class for the given space.""" + + _legacy_patch_shapes(space) + obs_shape = space.shape + + if isinstance(space, (gym.spaces.Discrete, gym.spaces.MultiDiscrete)): + preprocessor = OneHotPreprocessor + elif obs_shape == ATARI_OBS_SHAPE: + logger.debug( + "Defaulting to RLlib's GenericPixelPreprocessor because input " + "space has the atari-typical shape {}. 
Turn this behaviour off by setting " + "`preprocessor_pref=None` or " + "`preprocessor_pref='deepmind'` or disabling the preprocessing API " + "altogether with `_disable_preprocessor_api=True`.".format(ATARI_OBS_SHAPE) + ) + preprocessor = GenericPixelPreprocessor + elif obs_shape == ATARI_RAM_OBS_SHAPE: + logger.debug( + "Defaulting to RLlib's AtariRamPreprocessor because input " + "space has the atari-typical shape {}. Turn this behaviour off by setting " + "`preprocessor_pref=None` or " + "`preprocessor_pref='deepmind' or disabling the preprocessing API " + "altogether with `_disable_preprocessor_api=True`." + "`.".format(ATARI_OBS_SHAPE) + ) + preprocessor = AtariRamPreprocessor + elif isinstance(space, gym.spaces.Tuple): + preprocessor = TupleFlatteningPreprocessor + elif isinstance(space, gym.spaces.Dict): + preprocessor = DictFlatteningPreprocessor + elif isinstance(space, Repeated): + preprocessor = RepeatedValuesPreprocessor + # We usually only want to include this when using RLModules + elif isinstance(space, gym.spaces.MultiBinary) and include_multi_binary: + preprocessor = MultiBinaryPreprocessor + else: + preprocessor = NoPreprocessor + + return preprocessor + + +def _legacy_patch_shapes(space: gym.Space) -> List[int]: + """Assigns shapes to spaces that don't have shapes. + + This is only needed for older gym versions that don't set shapes properly + for Tuple and Discrete spaces. 
+ """ + + if not hasattr(space, "shape"): + if isinstance(space, gym.spaces.Discrete): + space.shape = () + elif isinstance(space, gym.spaces.Tuple): + shapes = [] + for s in space.spaces: + shape = _legacy_patch_shapes(s) + shapes.append(shape) + space.shape = tuple(shapes) + + return space.shape diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/repeated_values.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/repeated_values.py new file mode 100644 index 0000000000000000000000000000000000000000..7ecef777f667bffec0008bb0b1fdf1aa7c88d52e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/repeated_values.py @@ -0,0 +1,204 @@ +from typing import List + +from ray.rllib.utils.annotations import OldAPIStack +from ray.rllib.utils.typing import TensorType, TensorStructType + + +@OldAPIStack +class RepeatedValues: + """Represents a variable-length list of items from spaces.Repeated. + + RepeatedValues are created when you use spaces.Repeated, and are + accessible as part of input_dict["obs"] in ModelV2 forward functions. + + Example: + Suppose the gym space definition was: + Repeated(Repeated(Box(K), N), M) + + Then in the model forward function, input_dict["obs"] is of type: + RepeatedValues(RepeatedValues()) + + The tensor is accessible via: + input_dict["obs"].values.values + + And the actual data lengths via: + # outer repetition, shape [B], range [0, M] + input_dict["obs"].lengths + -and- + # inner repetition, shape [B, M], range [0, N] + input_dict["obs"].values.lengths + + Attributes: + values: The padded data tensor of shape [B, max_len, ..., sz], + where B is the batch dimension, max_len is the max length of this + list, followed by any number of sub list max lens, followed by the + actual data size. + lengths (List[int]): Tensor of shape [B, ...] that represents the + number of valid items in each list. When the list is nested within + other lists, there will be extra dimensions for the parent list + max lens. 
+ max_len: The max number of items allowed in each list. + + TODO(ekl): support conversion to tf.RaggedTensor. + """ + + def __init__(self, values: TensorType, lengths: List[int], max_len: int): + self.values = values + self.lengths = lengths + self.max_len = max_len + self._unbatched_repr = None + + def unbatch_all(self) -> List[List[TensorType]]: + """Unbatch both the repeat and batch dimensions into Python lists. + + This is only supported in PyTorch / TF eager mode. + + This lets you view the data unbatched in its original form, but is + not efficient for processing. + + .. testcode:: + :skipif: True + + batch = RepeatedValues() + items = batch.unbatch_all() + print(len(items) == B) + + .. testoutput:: + + True + + .. testcode:: + :skipif: True + + print(max(len(x) for x in items) <= N) + + .. testoutput:: + + True + + .. testcode:: + :skipif: True + + print(items) + + .. testoutput:: + + [[, ..., ], + ... + [, ], + ... + [], + ... + [, ..., ]] + """ + + if self._unbatched_repr is None: + B = _get_batch_dim_helper(self.values) + if B is None: + raise ValueError( + "Cannot call unbatch_all() when batch_dim is unknown. " + "This is probably because you are using TF graph mode." + ) + else: + B = int(B) + slices = self.unbatch_repeat_dim() + result = [] + for i in range(B): + if hasattr(self.lengths[i], "item"): + dynamic_len = int(self.lengths[i].item()) + else: + dynamic_len = int(self.lengths[i].numpy()) + dynamic_slice = [] + for j in range(dynamic_len): + dynamic_slice.append(_batch_index_helper(slices, i, j)) + result.append(dynamic_slice) + self._unbatched_repr = result + + return self._unbatched_repr + + def unbatch_repeat_dim(self) -> List[TensorType]: + """Unbatches the repeat dimension (the one `max_len` in size). + + This removes the repeat dimension. The result will be a Python list of + with length `self.max_len`. Note that the data is still padded. + + .. 
testcode:: + :skipif: True + + batch = RepeatedValues() + items = batch.unbatch() + len(items) == batch.max_len + + .. testoutput:: + + True + + .. testcode:: + :skipif: True + + print(items) + + .. testoutput:: + + [, ..., ] + """ + return _unbatch_helper(self.values, self.max_len) + + def __repr__(self): + return "RepeatedValues(value={}, lengths={}, max_len={})".format( + repr(self.values), repr(self.lengths), self.max_len + ) + + def __str__(self): + return repr(self) + + +def _get_batch_dim_helper(v: TensorStructType) -> int: + """Tries to find the batch dimension size of v, or None.""" + if isinstance(v, dict): + for u in v.values(): + return _get_batch_dim_helper(u) + elif isinstance(v, tuple): + return _get_batch_dim_helper(v[0]) + elif isinstance(v, RepeatedValues): + return _get_batch_dim_helper(v.values) + else: + B = v.shape[0] + if hasattr(B, "value"): + B = B.value # TensorFlow + return B + + +def _unbatch_helper(v: TensorStructType, max_len: int) -> TensorStructType: + """Recursively unpacks the repeat dimension (max_len).""" + if isinstance(v, dict): + return {k: _unbatch_helper(u, max_len) for (k, u) in v.items()} + elif isinstance(v, tuple): + return tuple(_unbatch_helper(u, max_len) for u in v) + elif isinstance(v, RepeatedValues): + unbatched = _unbatch_helper(v.values, max_len) + return [ + RepeatedValues(u, v.lengths[:, i, ...], v.max_len) + for i, u in enumerate(unbatched) + ] + else: + return [v[:, i, ...] for i in range(max_len)] + + +def _batch_index_helper(v: TensorStructType, i: int, j: int) -> TensorStructType: + """Selects the item at the ith batch index and jth repetition.""" + if isinstance(v, dict): + return {k: _batch_index_helper(u, i, j) for (k, u) in v.items()} + elif isinstance(v, tuple): + return tuple(_batch_index_helper(u, i, j) for u in v) + elif isinstance(v, list): + # This is the output of unbatch_repeat_dim(). 
Unfortunately we have to + # process it here instead of in unbatch_all(), since it may be buried + # under a dict / tuple. + return _batch_index_helper(v[j], i, j) + elif isinstance(v, RepeatedValues): + unbatched = v.unbatch_all() + # Don't need to select j here; that's already done in unbatch_all. + return unbatched[i] + else: + return v[i, ...] diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..86d33b39d455bcd6f43da151586444df41711b1c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__init__.py @@ -0,0 +1,11 @@ +from ray.rllib.models.tf.tf_modelv2 import TFModelV2 +from ray.rllib.models.tf.fcnet import FullyConnectedNetwork +from ray.rllib.models.tf.recurrent_net import RecurrentNetwork +from ray.rllib.models.tf.visionnet import VisionNetwork + +__all__ = [ + "FullyConnectedNetwork", + "RecurrentNetwork", + "TFModelV2", + "VisionNetwork", +] diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__pycache__/complex_input_net.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__pycache__/complex_input_net.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1ba6ff259ea095560cb79cc7d965365700583849 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__pycache__/complex_input_net.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__pycache__/misc.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__pycache__/misc.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..477cb69891b019020d1dbcb8c66ac4b3d751387f Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__pycache__/misc.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/attention_net.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/attention_net.py new file mode 100644 index 0000000000000000000000000000000000000000..886580fce177a0e075eb2d252ef869e181f5ae1b --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/attention_net.py @@ -0,0 +1,573 @@ +""" +[1] - Attention Is All You Need - Vaswani, Jones, Shazeer, Parmar, + Uszkoreit, Gomez, Kaiser - Google Brain/Research, U Toronto - 2017. + https://arxiv.org/pdf/1706.03762.pdf +[2] - Stabilizing Transformers for Reinforcement Learning - E. Parisotto + et al. - DeepMind - 2019. https://arxiv.org/pdf/1910.06764.pdf +[3] - Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context. + Z. Dai, Z. Yang, et al. - Carnegie Mellon U - 2019. + https://www.aclweb.org/anthology/P19-1285.pdf +""" +import gymnasium as gym +from gymnasium.spaces import Box, Discrete, MultiDiscrete +import numpy as np +import tree # pip install dm_tree +from typing import Any, Dict, Optional, Union + +from ray.rllib.models.modelv2 import ModelV2 +from ray.rllib.models.tf.layers import ( + GRUGate, + RelativeMultiHeadAttention, + SkipConnection, +) +from ray.rllib.models.tf.tf_modelv2 import TFModelV2 +from ray.rllib.models.tf.recurrent_net import RecurrentNetwork +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.policy.view_requirement import ViewRequirement +from ray.rllib.utils.annotations import OldAPIStack, override +from ray.rllib.utils.framework import try_import_tf +from ray.rllib.utils.spaces.space_utils import get_base_struct_from_space +from ray.rllib.utils.tf_utils import flatten_inputs_to_1d_tensor, one_hot +from ray.rllib.utils.typing import ModelConfigDict, TensorType, List +from ray.rllib.utils.deprecation import deprecation_warning +from ray.util import log_once + +tf1, tf, tfv = try_import_tf() + + +@OldAPIStack +class PositionwiseFeedforward(tf.keras.layers.Layer if tf else 
object): + """A 2x linear layer with ReLU activation in between described in [1]. + + Each timestep coming from the attention head will be passed through this + layer separately. + """ + + def __init__( + self, + out_dim: int, + hidden_dim: int, + output_activation: Optional[Any] = None, + **kwargs, + ): + super().__init__(**kwargs) + + self._hidden_layer = tf.keras.layers.Dense( + hidden_dim, + activation=tf.nn.relu, + ) + + self._output_layer = tf.keras.layers.Dense( + out_dim, activation=output_activation + ) + if log_once("positionwise_feedforward_tf"): + deprecation_warning( + old="rllib.models.tf.attention_net.PositionwiseFeedforward", + ) + + def call(self, inputs: TensorType, **kwargs) -> TensorType: + del kwargs + output = self._hidden_layer(inputs) + return self._output_layer(output) + + +@OldAPIStack +class TrXLNet(RecurrentNetwork): + """A TrXL net Model described in [1].""" + + def __init__( + self, + observation_space: gym.spaces.Space, + action_space: gym.spaces.Space, + num_outputs: int, + model_config: ModelConfigDict, + name: str, + num_transformer_units: int, + attention_dim: int, + num_heads: int, + head_dim: int, + position_wise_mlp_dim: int, + ): + """Initializes a TrXLNet object. + + Args: + num_transformer_units: The number of Transformer repeats to + use (denoted L in [2]). + attention_dim: The input and output dimensions of one + Transformer unit. + num_heads: The number of attention heads to use in parallel. + Denoted as `H` in [3]. + head_dim: The dimension of a single(!) attention head within + a multi-head attention unit. Denoted as `d` in [3]. + position_wise_mlp_dim: The dimension of the hidden layer + within the position-wise MLP (after the multi-head attention + block within one Transformer unit). This is the size of the + first of the two layers within the PositionwiseFeedforward. The + second layer always has size=`attention_dim`. 
+ """ + if log_once("trxl_net_tf"): + deprecation_warning( + old="rllib.models.tf.attention_net.TrXLNet", + ) + super().__init__( + observation_space, action_space, num_outputs, model_config, name + ) + + self.num_transformer_units = num_transformer_units + self.attention_dim = attention_dim + self.num_heads = num_heads + self.head_dim = head_dim + self.max_seq_len = model_config["max_seq_len"] + self.obs_dim = observation_space.shape[0] + + inputs = tf.keras.layers.Input( + shape=(self.max_seq_len, self.obs_dim), name="inputs" + ) + E_out = tf.keras.layers.Dense(attention_dim)(inputs) + + for _ in range(self.num_transformer_units): + MHA_out = SkipConnection( + RelativeMultiHeadAttention( + out_dim=attention_dim, + num_heads=num_heads, + head_dim=head_dim, + input_layernorm=False, + output_activation=None, + ), + fan_in_layer=None, + )(E_out) + E_out = SkipConnection( + PositionwiseFeedforward(attention_dim, position_wise_mlp_dim) + )(MHA_out) + E_out = tf.keras.layers.LayerNormalization(axis=-1)(E_out) + + # Postprocess TrXL output with another hidden layer and compute values. + logits = tf.keras.layers.Dense( + self.num_outputs, activation=tf.keras.activations.linear, name="logits" + )(E_out) + + self.base_model = tf.keras.models.Model([inputs], [logits]) + + @override(RecurrentNetwork) + def forward_rnn( + self, inputs: TensorType, state: List[TensorType], seq_lens: TensorType + ) -> (TensorType, List[TensorType]): + # To make Attention work with current RLlib's ModelV2 API: + # We assume `state` is the history of L recent observations (all + # concatenated into one tensor) and append the current inputs to the + # end and only keep the most recent (up to `max_seq_len`). This allows + # us to deal with timestep-wise inference and full sequence training + # within the same logic. 
+ observations = state[0] + observations = tf.concat((observations, inputs), axis=1)[:, -self.max_seq_len :] + logits = self.base_model([observations]) + T = tf.shape(inputs)[1] # Length of input segment (time). + logits = logits[:, -T:] + + return logits, [observations] + + @override(RecurrentNetwork) + def get_initial_state(self) -> List[np.ndarray]: + # State is the T last observations concat'd together into one Tensor. + # Plus all Transformer blocks' E(l) outputs concat'd together (up to + # tau timesteps). + return [np.zeros((self.max_seq_len, self.obs_dim), np.float32)] + + +class GTrXLNet(RecurrentNetwork): + """A GTrXL net Model described in [2]. + + This is still in an experimental phase. + Can be used as a drop-in replacement for LSTMs in PPO and IMPALA. + + To use this network as a replacement for an RNN, configure your Algorithm + as follows: + + Examples: + >> config["model"]["custom_model"] = GTrXLNet + >> config["model"]["max_seq_len"] = 10 + >> config["model"]["custom_model_config"] = { + >> num_transformer_units=1, + >> attention_dim=32, + >> num_heads=2, + >> memory_inference=100, + >> memory_training=50, + >> etc.. + >> } + """ + + def __init__( + self, + observation_space: gym.spaces.Space, + action_space: gym.spaces.Space, + num_outputs: Optional[int], + model_config: ModelConfigDict, + name: str, + *, + num_transformer_units: int = 1, + attention_dim: int = 64, + num_heads: int = 2, + memory_inference: int = 50, + memory_training: int = 50, + head_dim: int = 32, + position_wise_mlp_dim: int = 32, + init_gru_gate_bias: float = 2.0, + ): + """Initializes a GTrXLNet instance. + + Args: + num_transformer_units: The number of Transformer repeats to + use (denoted L in [2]). + attention_dim: The input and output dimensions of one + Transformer unit. + num_heads: The number of attention heads to use in parallel. + Denoted as `H` in [3]. 
+ memory_inference: The number of timesteps to concat (time + axis) and feed into the next transformer unit as inference + input. The first transformer unit will receive this number of + past observations (plus the current one), instead. + memory_training: The number of timesteps to concat (time + axis) and feed into the next transformer unit as training + input (plus the actual input sequence of len=max_seq_len). + The first transformer unit will receive this number of + past observations (plus the input sequence), instead. + head_dim: The dimension of a single(!) attention head within + a multi-head attention unit. Denoted as `d` in [3]. + position_wise_mlp_dim: The dimension of the hidden layer + within the position-wise MLP (after the multi-head attention + block within one Transformer unit). This is the size of the + first of the two layers within the PositionwiseFeedforward. The + second layer always has size=`attention_dim`. + init_gru_gate_bias: Initial bias values for the GRU gates + (two GRUs per Transformer unit, one after the MHA, one after + the position-wise MLP). + """ + super().__init__( + observation_space, action_space, num_outputs, model_config, name + ) + + self.num_transformer_units = num_transformer_units + self.attention_dim = attention_dim + self.num_heads = num_heads + self.memory_inference = memory_inference + self.memory_training = memory_training + self.head_dim = head_dim + self.max_seq_len = model_config["max_seq_len"] + self.obs_dim = observation_space.shape[0] + + # Raw observation input (plus (None) time axis). + input_layer = tf.keras.layers.Input(shape=(None, self.obs_dim), name="inputs") + memory_ins = [ + tf.keras.layers.Input( + shape=(None, self.attention_dim), + dtype=tf.float32, + name="memory_in_{}".format(i), + ) + for i in range(self.num_transformer_units) + ] + + # Map observation dim to input/output transformer (attention) dim. 
+ E_out = tf.keras.layers.Dense(self.attention_dim)(input_layer) + # Output, collected and concat'd to build the internal, tau-len + # Memory units used for additional contextual information. + memory_outs = [E_out] + + # 2) Create L Transformer blocks according to [2]. + for i in range(self.num_transformer_units): + # RelativeMultiHeadAttention part. + MHA_out = SkipConnection( + RelativeMultiHeadAttention( + out_dim=self.attention_dim, + num_heads=num_heads, + head_dim=head_dim, + input_layernorm=True, + output_activation=tf.nn.relu, + ), + fan_in_layer=GRUGate(init_gru_gate_bias), + name="mha_{}".format(i + 1), + )(E_out, memory=memory_ins[i]) + # Position-wise MLP part. + E_out = SkipConnection( + tf.keras.Sequential( + ( + tf.keras.layers.LayerNormalization(axis=-1), + PositionwiseFeedforward( + out_dim=self.attention_dim, + hidden_dim=position_wise_mlp_dim, + output_activation=tf.nn.relu, + ), + ) + ), + fan_in_layer=GRUGate(init_gru_gate_bias), + name="pos_wise_mlp_{}".format(i + 1), + )(MHA_out) + # Output of position-wise MLP == E(l-1), which is concat'd + # to the current Mem block (M(l-1)) to yield E~(l-1), which is then + # used by the next transformer block. + memory_outs.append(E_out) + + self._logits = None + self._value_out = None + + # Postprocess TrXL output with another hidden layer and compute values. + if num_outputs is not None: + self._logits = tf.keras.layers.Dense( + self.num_outputs, activation=None, name="logits" + )(E_out) + values_out = tf.keras.layers.Dense(1, activation=None, name="values")(E_out) + outs = [self._logits, values_out] + else: + outs = [E_out] + self.num_outputs = self.attention_dim + + self.trxl_model = tf.keras.Model( + inputs=[input_layer] + memory_ins, outputs=outs + memory_outs[:-1] + ) + + self.trxl_model.summary() + + # __sphinx_doc_begin__ + # Setup trajectory views (`memory-inference` x past memory outs). 
+ for i in range(self.num_transformer_units): + space = Box(-1.0, 1.0, shape=(self.attention_dim,)) + self.view_requirements["state_in_{}".format(i)] = ViewRequirement( + "state_out_{}".format(i), + shift="-{}:-1".format(self.memory_inference), + # Repeat the incoming state every max-seq-len times. + batch_repeat_value=self.max_seq_len, + space=space, + ) + self.view_requirements["state_out_{}".format(i)] = ViewRequirement( + space=space, used_for_training=False + ) + # __sphinx_doc_end__ + + @override(ModelV2) + def forward( + self, input_dict, state: List[TensorType], seq_lens: TensorType + ) -> (TensorType, List[TensorType]): + assert seq_lens is not None + + # Add the time dim to observations. + B = tf.shape(seq_lens)[0] + observations = input_dict[SampleBatch.OBS] + + shape = tf.shape(observations) + T = shape[0] // B + observations = tf.reshape(observations, tf.concat([[-1, T], shape[1:]], axis=0)) + + all_out = self.trxl_model([observations] + state) + + if self._logits is not None: + out = tf.reshape(all_out[0], [-1, self.num_outputs]) + self._value_out = all_out[1] + memory_outs = all_out[2:] + else: + out = tf.reshape(all_out[0], [-1, self.attention_dim]) + memory_outs = all_out[1:] + + return out, [tf.reshape(m, [-1, self.attention_dim]) for m in memory_outs] + + @override(RecurrentNetwork) + def get_initial_state(self) -> List[np.ndarray]: + return [ + tf.zeros(self.view_requirements["state_in_{}".format(i)].space.shape) + for i in range(self.num_transformer_units) + ] + + @override(ModelV2) + def value_function(self) -> TensorType: + return tf.reshape(self._value_out, [-1]) + + +class AttentionWrapper(TFModelV2): + """GTrXL wrapper serving as interface for ModelV2s that set use_attention.""" + + def __init__( + self, + obs_space: gym.spaces.Space, + action_space: gym.spaces.Space, + num_outputs: int, + model_config: ModelConfigDict, + name: str, + ): + if log_once("attention_wrapper_tf_deprecation"): + deprecation_warning( + 
old="ray.rllib.models.tf.attention_net.AttentionWrapper" + ) + super().__init__(obs_space, action_space, None, model_config, name) + + self.use_n_prev_actions = model_config["attention_use_n_prev_actions"] + self.use_n_prev_rewards = model_config["attention_use_n_prev_rewards"] + + self.action_space_struct = get_base_struct_from_space(self.action_space) + self.action_dim = 0 + + for space in tree.flatten(self.action_space_struct): + if isinstance(space, Discrete): + self.action_dim += space.n + elif isinstance(space, MultiDiscrete): + self.action_dim += np.sum(space.nvec) + elif space.shape is not None: + self.action_dim += int(np.prod(space.shape)) + else: + self.action_dim += int(len(space)) + + # Add prev-action/reward nodes to input to LSTM. + if self.use_n_prev_actions: + self.num_outputs += self.use_n_prev_actions * self.action_dim + if self.use_n_prev_rewards: + self.num_outputs += self.use_n_prev_rewards + + cfg = model_config + + self.attention_dim = cfg["attention_dim"] + + if self.num_outputs is not None: + in_space = gym.spaces.Box( + float("-inf"), float("inf"), shape=(self.num_outputs,), dtype=np.float32 + ) + else: + in_space = obs_space + + # Construct GTrXL sub-module w/ num_outputs=None (so it does not + # create a logits/value output; we'll do this ourselves in this wrapper + # here). + self.gtrxl = GTrXLNet( + in_space, + action_space, + None, + model_config, + "gtrxl", + num_transformer_units=cfg["attention_num_transformer_units"], + attention_dim=self.attention_dim, + num_heads=cfg["attention_num_heads"], + head_dim=cfg["attention_head_dim"], + memory_inference=cfg["attention_memory_inference"], + memory_training=cfg["attention_memory_training"], + position_wise_mlp_dim=cfg["attention_position_wise_mlp_dim"], + init_gru_gate_bias=cfg["attention_init_gru_gate_bias"], + ) + + # `self.num_outputs` right now is the number of nodes coming from the + # attention net. 
+ input_ = tf.keras.layers.Input(shape=(self.gtrxl.num_outputs,)) + + # Set final num_outputs to correct value (depending on action space). + self.num_outputs = num_outputs + + # Postprocess GTrXL output with another hidden layer and compute + # values. + out = tf.keras.layers.Dense(self.num_outputs, activation=None)(input_) + self._logits_branch = tf.keras.models.Model([input_], [out]) + + out = tf.keras.layers.Dense(1, activation=None)(input_) + self._value_branch = tf.keras.models.Model([input_], [out]) + + self.view_requirements = self.gtrxl.view_requirements + self.view_requirements["obs"].space = self.obs_space + + # Add prev-a/r to this model's view, if required. + if self.use_n_prev_actions: + self.view_requirements[SampleBatch.PREV_ACTIONS] = ViewRequirement( + SampleBatch.ACTIONS, + space=self.action_space, + shift="-{}:-1".format(self.use_n_prev_actions), + ) + if self.use_n_prev_rewards: + self.view_requirements[SampleBatch.PREV_REWARDS] = ViewRequirement( + SampleBatch.REWARDS, shift="-{}:-1".format(self.use_n_prev_rewards) + ) + + @override(RecurrentNetwork) + def forward( + self, + input_dict: Dict[str, TensorType], + state: List[TensorType], + seq_lens: TensorType, + ) -> (TensorType, List[TensorType]): + assert seq_lens is not None + # Push obs through "unwrapped" net's `forward()` first. + wrapped_out, _ = self._wrapped_forward(input_dict, [], None) + + # Concat. prev-action/reward if required. + prev_a_r = [] + + # Prev actions. + if self.use_n_prev_actions: + prev_n_actions = input_dict[SampleBatch.PREV_ACTIONS] + # If actions are not processed yet (in their original form as + # have been sent to environment): + # Flatten/one-hot into 1D array. + if self.model_config["_disable_action_flattening"]: + # Merge prev n actions into flat tensor. + flat = flatten_inputs_to_1d_tensor( + prev_n_actions, + spaces_struct=self.action_space_struct, + time_axis=True, + ) + # Fold time-axis into flattened data. 
+ flat = tf.reshape(flat, [tf.shape(flat)[0], -1]) + prev_a_r.append(flat) + # If actions are already flattened (but not one-hot'd yet!), + # one-hot discrete/multi-discrete actions here and concatenate the + # n most recent actions together. + else: + if isinstance(self.action_space, Discrete): + for i in range(self.use_n_prev_actions): + prev_a_r.append( + one_hot(prev_n_actions[:, i], self.action_space) + ) + elif isinstance(self.action_space, MultiDiscrete): + for i in range( + 0, self.use_n_prev_actions, self.action_space.shape[0] + ): + prev_a_r.append( + one_hot( + tf.cast( + prev_n_actions[ + :, i : i + self.action_space.shape[0] + ], + tf.float32, + ), + space=self.action_space, + ) + ) + else: + prev_a_r.append( + tf.reshape( + tf.cast(prev_n_actions, tf.float32), + [-1, self.use_n_prev_actions * self.action_dim], + ) + ) + # Prev rewards. + if self.use_n_prev_rewards: + prev_a_r.append( + tf.reshape( + tf.cast(input_dict[SampleBatch.PREV_REWARDS], tf.float32), + [-1, self.use_n_prev_rewards], + ) + ) + + # Concat prev. actions + rewards to the "main" input. + if prev_a_r: + wrapped_out = tf.concat([wrapped_out] + prev_a_r, axis=1) + + # Then through our GTrXL. + input_dict["obs_flat"] = input_dict["obs"] = wrapped_out + + self._features, memory_outs = self.gtrxl(input_dict, state, seq_lens) + model_out = self._logits_branch(self._features) + return model_out, memory_outs + + @override(ModelV2) + def value_function(self) -> TensorType: + assert self._features is not None, "Must call forward() first!" 
+ return tf.reshape(self._value_branch(self._features), [-1]) + + @override(ModelV2) + def get_initial_state(self) -> Union[List[np.ndarray], List[TensorType]]: + return [ + np.zeros(self.gtrxl.view_requirements["state_in_{}".format(i)].space.shape) + for i in range(self.gtrxl.num_transformer_units) + ] diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/complex_input_net.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/complex_input_net.py new file mode 100644 index 0000000000000000000000000000000000000000..d8c41be4067a1473107a39a1f4f7ec94d8a99f27 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/complex_input_net.py @@ -0,0 +1,214 @@ +from gymnasium.spaces import Box, Discrete, MultiDiscrete +import numpy as np +import tree # pip install dm_tree + +from ray.rllib.models.catalog import ModelCatalog +from ray.rllib.models.modelv2 import ModelV2, restore_original_dimensions +from ray.rllib.models.tf.misc import normc_initializer +from ray.rllib.models.tf.tf_modelv2 import TFModelV2 +from ray.rllib.models.utils import get_filter_config +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils.annotations import OldAPIStack, override +from ray.rllib.utils.framework import try_import_tf +from ray.rllib.utils.spaces.space_utils import flatten_space +from ray.rllib.utils.tf_utils import one_hot + +tf1, tf, tfv = try_import_tf() + + +# __sphinx_doc_begin__ +@OldAPIStack +class ComplexInputNetwork(TFModelV2): + """TFModelV2 concat'ing CNN outputs to flat input(s), followed by FC(s). + + Note: This model should be used for complex (Dict or Tuple) observation + spaces that have one or more image components. + + The data flow is as follows: + + `obs` (e.g. Tuple[img0, img1, discrete0]) -> `CNN0 + CNN1 + ONE-HOT` + `CNN0 + CNN1 + ONE-HOT` -> concat all flat outputs -> `out` + `out` -> (optional) FC-stack -> `out2` + `out2` -> action (logits) and vaulue heads. 
+ """ + + def __init__(self, obs_space, action_space, num_outputs, model_config, name): + + self.original_space = ( + obs_space.original_space + if hasattr(obs_space, "original_space") + else obs_space + ) + + self.processed_obs_space = ( + self.original_space + if model_config.get("_disable_preprocessor_api") + else obs_space + ) + super().__init__( + self.original_space, action_space, num_outputs, model_config, name + ) + + self.flattened_input_space = flatten_space(self.original_space) + + # Build the CNN(s) given obs_space's image components. + self.cnns = {} + self.one_hot = {} + self.flatten_dims = {} + self.flatten = {} + concat_size = 0 + for i, component in enumerate(self.flattened_input_space): + # Image space. + if len(component.shape) == 3 and isinstance(component, Box): + config = { + "conv_filters": model_config["conv_filters"] + if "conv_filters" in model_config + else get_filter_config(component.shape), + "conv_activation": model_config.get("conv_activation"), + "post_fcnet_hiddens": [], + } + self.cnns[i] = ModelCatalog.get_model_v2( + component, + action_space, + num_outputs=None, + model_config=config, + framework="tf", + name="cnn_{}".format(i), + ) + concat_size += int(self.cnns[i].num_outputs) + # Discrete|MultiDiscrete inputs -> One-hot encode. + elif isinstance(component, (Discrete, MultiDiscrete)): + if isinstance(component, Discrete): + size = component.n + else: + size = np.sum(component.nvec) + config = { + "fcnet_hiddens": model_config["fcnet_hiddens"], + "fcnet_activation": model_config.get("fcnet_activation"), + "post_fcnet_hiddens": [], + } + self.one_hot[i] = ModelCatalog.get_model_v2( + Box(-1.0, 1.0, (size,), np.float32), + action_space, + num_outputs=None, + model_config=config, + framework="tf", + name="one_hot_{}".format(i), + ) + concat_size += int(self.one_hot[i].num_outputs) + # Everything else (1D Box). 
+ else: + size = int(np.prod(component.shape)) + config = { + "fcnet_hiddens": model_config["fcnet_hiddens"], + "fcnet_activation": model_config.get("fcnet_activation"), + "post_fcnet_hiddens": [], + } + self.flatten[i] = ModelCatalog.get_model_v2( + Box(-1.0, 1.0, (size,), np.float32), + action_space, + num_outputs=None, + model_config=config, + framework="tf", + name="flatten_{}".format(i), + ) + self.flatten_dims[i] = size + concat_size += int(self.flatten[i].num_outputs) + + # Optional post-concat FC-stack. + post_fc_stack_config = { + "fcnet_hiddens": model_config.get("post_fcnet_hiddens", []), + "fcnet_activation": model_config.get("post_fcnet_activation", "relu"), + } + self.post_fc_stack = ModelCatalog.get_model_v2( + Box(float("-inf"), float("inf"), shape=(concat_size,), dtype=np.float32), + self.action_space, + None, + post_fc_stack_config, + framework="tf", + name="post_fc_stack", + ) + + # Actions and value heads. + self.logits_and_value_model = None + self._value_out = None + if num_outputs: + # Action-distribution head. + concat_layer = tf.keras.layers.Input((self.post_fc_stack.num_outputs,)) + logits_layer = tf.keras.layers.Dense( + num_outputs, + activation=None, + kernel_initializer=normc_initializer(0.01), + name="logits", + )(concat_layer) + + # Create the value branch model. + value_layer = tf.keras.layers.Dense( + 1, + activation=None, + kernel_initializer=normc_initializer(0.01), + name="value_out", + )(concat_layer) + self.logits_and_value_model = tf.keras.models.Model( + concat_layer, [logits_layer, value_layer] + ) + else: + self.num_outputs = self.post_fc_stack.num_outputs + + @override(ModelV2) + def forward(self, input_dict, state, seq_lens): + if SampleBatch.OBS in input_dict and "obs_flat" in input_dict: + orig_obs = input_dict[SampleBatch.OBS] + else: + orig_obs = restore_original_dimensions( + input_dict[SampleBatch.OBS], self.processed_obs_space, tensorlib="tf" + ) + # Push image observations through our CNNs. 
+ outs = [] + for i, component in enumerate(tree.flatten(orig_obs)): + if i in self.cnns: + cnn_out, _ = self.cnns[i](SampleBatch({SampleBatch.OBS: component})) + outs.append(cnn_out) + elif i in self.one_hot: + if "int" in component.dtype.name: + one_hot_in = { + SampleBatch.OBS: one_hot( + component, self.flattened_input_space[i] + ) + } + else: + one_hot_in = {SampleBatch.OBS: component} + one_hot_out, _ = self.one_hot[i](SampleBatch(one_hot_in)) + outs.append(one_hot_out) + else: + nn_out, _ = self.flatten[i]( + SampleBatch( + { + SampleBatch.OBS: tf.cast( + tf.reshape(component, [-1, self.flatten_dims[i]]), + tf.float32, + ) + } + ) + ) + outs.append(nn_out) + # Concat all outputs and the non-image inputs. + out = tf.concat(outs, axis=1) + # Push through (optional) FC-stack (this may be an empty stack). + out, _ = self.post_fc_stack(SampleBatch({SampleBatch.OBS: out})) + + # No logits/value branches. + if not self.logits_and_value_model: + return out, [] + + # Logits- and value branches. 
+ logits, values = self.logits_and_value_model(out) + self._value_out = tf.reshape(values, [-1]) + return logits, [] + + @override(ModelV2) + def value_function(self): + return self._value_out + + +# __sphinx_doc_end__ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/fcnet.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/fcnet.py new file mode 100644 index 0000000000000000000000000000000000000000..56a09de0361acf371f5d6b8ab60fc7d2790565c7 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/fcnet.py @@ -0,0 +1,148 @@ +import numpy as np +import gymnasium as gym +from typing import Dict + +from ray.rllib.models.tf.misc import normc_initializer +from ray.rllib.models.tf.tf_modelv2 import TFModelV2 +from ray.rllib.models.utils import get_activation_fn +from ray.rllib.utils.annotations import OldAPIStack +from ray.rllib.utils.framework import try_import_tf +from ray.rllib.utils.typing import TensorType, List, ModelConfigDict + +tf1, tf, tfv = try_import_tf() + + +@OldAPIStack +class FullyConnectedNetwork(TFModelV2): + """Generic fully connected network implemented in ModelV2 API.""" + + def __init__( + self, + obs_space: gym.spaces.Space, + action_space: gym.spaces.Space, + num_outputs: int, + model_config: ModelConfigDict, + name: str, + ): + super(FullyConnectedNetwork, self).__init__( + obs_space, action_space, num_outputs, model_config, name + ) + + hiddens = list(model_config.get("fcnet_hiddens", [])) + list( + model_config.get("post_fcnet_hiddens", []) + ) + activation = model_config.get("fcnet_activation") + if not model_config.get("fcnet_hiddens", []): + activation = model_config.get("post_fcnet_activation") + activation = get_activation_fn(activation) + no_final_linear = model_config.get("no_final_linear") + vf_share_layers = model_config.get("vf_share_layers") + free_log_std = model_config.get("free_log_std") + + # Generate free-floating bias variables for the second half of + # the outputs. 
+ if free_log_std: + assert num_outputs % 2 == 0, ( + "num_outputs must be divisible by two", + num_outputs, + ) + num_outputs = num_outputs // 2 + self.log_std_var = tf.Variable( + [0.0] * num_outputs, dtype=tf.float32, name="log_std" + ) + + # We are using obs_flat, so take the flattened shape as input. + inputs = tf.keras.layers.Input( + shape=(int(np.prod(obs_space.shape)),), name="observations" + ) + # Last hidden layer output (before logits outputs). + last_layer = inputs + # The action distribution outputs. + logits_out = None + i = 1 + + # Create layers 0 to second-last. + for size in hiddens[:-1]: + last_layer = tf.keras.layers.Dense( + size, + name="fc_{}".format(i), + activation=activation, + kernel_initializer=normc_initializer(1.0), + )(last_layer) + i += 1 + + # The last layer is adjusted to be of size num_outputs, but it's a + # layer with activation. + if no_final_linear and num_outputs: + logits_out = tf.keras.layers.Dense( + num_outputs, + name="fc_out", + activation=activation, + kernel_initializer=normc_initializer(1.0), + )(last_layer) + # Finish the layers with the provided sizes (`hiddens`), plus - + # iff num_outputs > 0 - a last linear layer of size num_outputs. + else: + if len(hiddens) > 0: + last_layer = tf.keras.layers.Dense( + hiddens[-1], + name="fc_{}".format(i), + activation=activation, + kernel_initializer=normc_initializer(1.0), + )(last_layer) + if num_outputs: + logits_out = tf.keras.layers.Dense( + num_outputs, + name="fc_out", + activation=None, + kernel_initializer=normc_initializer(0.01), + )(last_layer) + # Adjust num_outputs to be the number of nodes in the last layer. + else: + self.num_outputs = ([int(np.prod(obs_space.shape))] + hiddens[-1:])[-1] + + # Concat the log std vars to the end of the state-dependent means. 
+ if free_log_std and logits_out is not None: + + def tiled_log_std(x): + return tf.tile(tf.expand_dims(self.log_std_var, 0), [tf.shape(x)[0], 1]) + + log_std_out = tf.keras.layers.Lambda(tiled_log_std)(inputs) + logits_out = tf.keras.layers.Concatenate(axis=1)([logits_out, log_std_out]) + + last_vf_layer = None + if not vf_share_layers: + # Build a parallel set of hidden layers for the value net. + last_vf_layer = inputs + i = 1 + for size in hiddens: + last_vf_layer = tf.keras.layers.Dense( + size, + name="fc_value_{}".format(i), + activation=activation, + kernel_initializer=normc_initializer(1.0), + )(last_vf_layer) + i += 1 + + value_out = tf.keras.layers.Dense( + 1, + name="value_out", + activation=None, + kernel_initializer=normc_initializer(0.01), + )(last_vf_layer if last_vf_layer is not None else last_layer) + + self.base_model = tf.keras.Model( + inputs, [(logits_out if logits_out is not None else last_layer), value_out] + ) + + def forward( + self, + input_dict: Dict[str, TensorType], + state: List[TensorType], + seq_lens: TensorType, + ) -> (TensorType, List[TensorType]): + model_out, self._value_out = self.base_model(input_dict["obs_flat"]) + return model_out, state + + def value_function(self) -> TensorType: + return tf.reshape(self._value_out, [-1]) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/misc.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..7ea75e423c2d66756b9e899faa2c6487dd57cab4 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/misc.py @@ -0,0 +1,90 @@ +import numpy as np +from typing import Tuple, Any, Optional + +from ray.rllib.utils.annotations import DeveloperAPI +from ray.rllib.utils.framework import try_import_tf +from ray.rllib.utils.typing import TensorType + +tf1, tf, tfv = try_import_tf() + + +# TODO: (sven) obsolete this class. 
+@DeveloperAPI +def normc_initializer(std: float = 1.0) -> Any: + def _initializer(shape, dtype=None, partition_info=None): + out = np.random.randn(*shape).astype( + dtype.name if hasattr(dtype, "name") else dtype or np.float32 + ) + out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True)) + return tf.constant(out) + + return _initializer + + +@DeveloperAPI +def conv2d( + x: TensorType, + num_filters: int, + name: str, + filter_size: Tuple[int, int] = (3, 3), + stride: Tuple[int, int] = (1, 1), + pad: str = "SAME", + dtype: Optional[Any] = None, + collections: Optional[Any] = None, +) -> TensorType: + + if dtype is None: + dtype = tf.float32 + + with tf1.variable_scope(name): + stride_shape = [1, stride[0], stride[1], 1] + filter_shape = [ + filter_size[0], + filter_size[1], + int(x.get_shape()[3]), + num_filters, + ] + + # There are "num input feature maps * filter height * filter width" + # inputs to each hidden unit. + fan_in = np.prod(filter_shape[:3]) + # Each unit in the lower layer receives a gradient from: "num output + # feature maps * filter height * filter width" / pooling size. + fan_out = np.prod(filter_shape[:2]) * num_filters + # Initialize weights with random weights. 
+ w_bound = np.sqrt(6 / (fan_in + fan_out)) + + w = tf1.get_variable( + "W", + filter_shape, + dtype, + tf1.random_uniform_initializer(-w_bound, w_bound), + collections=collections, + ) + b = tf1.get_variable( + "b", + [1, 1, 1, num_filters], + initializer=tf1.constant_initializer(0.0), + collections=collections, + ) + return tf1.nn.conv2d(x, w, stride_shape, pad) + b + + +@DeveloperAPI +def linear( + x: TensorType, + size: int, + name: str, + initializer: Optional[Any] = None, + bias_init: float = 0.0, +) -> TensorType: + w = tf1.get_variable(name + "/w", [x.get_shape()[1], size], initializer=initializer) + b = tf1.get_variable( + name + "/b", [size], initializer=tf1.constant_initializer(bias_init) + ) + return tf.matmul(x, w) + b + + +@DeveloperAPI +def flatten(x: TensorType) -> TensorType: + return tf.reshape(x, [-1, np.prod(x.get_shape().as_list()[1:])]) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/recurrent_net.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/recurrent_net.py new file mode 100644 index 0000000000000000000000000000000000000000..2010d4a901188a53cec5d2766e865392c1d7f9d0 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/recurrent_net.py @@ -0,0 +1,292 @@ +import numpy as np +import gymnasium as gym +from gymnasium.spaces import Discrete, MultiDiscrete +import logging +import tree # pip install dm_tree +from typing import Dict, List, Tuple + +from ray.rllib.models.modelv2 import ModelV2 +from ray.rllib.models.tf.tf_modelv2 import TFModelV2 +from ray.rllib.policy.rnn_sequencing import add_time_dimension +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.policy.view_requirement import ViewRequirement +from ray.rllib.utils.annotations import OldAPIStack, override +from ray.rllib.utils.framework import try_import_tf +from ray.rllib.utils.spaces.space_utils import get_base_struct_from_space +from ray.rllib.utils.tf_utils import flatten_inputs_to_1d_tensor, one_hot +from 
ray.rllib.utils.typing import ModelConfigDict, TensorType +from ray.rllib.utils.deprecation import deprecation_warning +from ray.util.debug import log_once + +tf1, tf, tfv = try_import_tf() +logger = logging.getLogger(__name__) + + +@OldAPIStack +class RecurrentNetwork(TFModelV2): + """Helper class to simplify implementing RNN models with TFModelV2. + + Instead of implementing forward(), you can implement forward_rnn() which + takes batches with the time dimension added already. + + Here is an example implementation for a subclass + ``MyRNNClass(RecurrentNetwork)``:: + + def __init__(self, *args, **kwargs): + super(MyModelClass, self).__init__(*args, **kwargs) + cell_size = 256 + + # Define input layers + input_layer = tf.keras.layers.Input( + shape=(None, obs_space.shape[0])) + state_in_h = tf.keras.layers.Input(shape=(256, )) + state_in_c = tf.keras.layers.Input(shape=(256, )) + seq_in = tf.keras.layers.Input(shape=(), dtype=tf.int32) + + # Send to LSTM cell + lstm_out, state_h, state_c = tf.keras.layers.LSTM( + cell_size, return_sequences=True, return_state=True, + name="lstm")( + inputs=input_layer, + mask=tf.sequence_mask(seq_in), + initial_state=[state_in_h, state_in_c]) + output_layer = tf.keras.layers.Dense(...)(lstm_out) + + # Create the RNN model + self.rnn_model = tf.keras.Model( + inputs=[input_layer, seq_in, state_in_h, state_in_c], + outputs=[output_layer, state_h, state_c]) + self.rnn_model.summary() + """ + + @override(ModelV2) + def forward( + self, + input_dict: Dict[str, TensorType], + state: List[TensorType], + seq_lens: TensorType, + ) -> Tuple[TensorType, List[TensorType]]: + """Adds time dimension to batch before sending inputs to forward_rnn(). + + You should implement forward_rnn() in your subclass.""" + # Creating a __init__ function that acts as a passthrough and adding the warning + # there led to errors probably due to the multiple inheritance. We encountered + # the same error if we add the Deprecated decorator. 
We therefore add the + # deprecation warning here. + if log_once("recurrent_network_tf"): + deprecation_warning( + old="ray.rllib.models.tf.recurrent_net.RecurrentNetwork" + ) + assert seq_lens is not None + flat_inputs = input_dict["obs_flat"] + inputs = add_time_dimension( + padded_inputs=flat_inputs, seq_lens=seq_lens, framework="tf" + ) + output, new_state = self.forward_rnn( + inputs, + state, + seq_lens, + ) + return tf.reshape(output, [-1, self.num_outputs]), new_state + + def forward_rnn( + self, inputs: TensorType, state: List[TensorType], seq_lens: TensorType + ) -> Tuple[TensorType, List[TensorType]]: + """Call the model with the given input tensors and state. + + Args: + inputs: observation tensor with shape [B, T, obs_size]. + state: list of state tensors, each with shape [B, T, size]. + seq_lens: 1d tensor holding input sequence lengths. + + Returns: + (outputs, new_state): The model output tensor of shape + [B, T, num_outputs] and the list of new state tensors each with + shape [B, size]. + + Sample implementation for the ``MyRNNClass`` example:: + + def forward_rnn(self, inputs, state, seq_lens): + model_out, h, c = self.rnn_model([inputs, seq_lens] + state) + return model_out, [h, c] + """ + raise NotImplementedError("You must implement this for a RNN model") + + def get_initial_state(self) -> List[TensorType]: + """Get the initial recurrent state values for the model. 
+ + Returns: + list of np.array objects, if any + + Sample implementation for the ``MyRNNClass`` example:: + + def get_initial_state(self): + return [ + np.zeros(self.cell_size, np.float32), + np.zeros(self.cell_size, np.float32), + ] + """ + raise NotImplementedError("You must implement this for a RNN model") + + +@OldAPIStack +class LSTMWrapper(RecurrentNetwork): + """An LSTM wrapper serving as an interface for ModelV2s that set use_lstm.""" + + def __init__( + self, + obs_space: gym.spaces.Space, + action_space: gym.spaces.Space, + num_outputs: int, + model_config: ModelConfigDict, + name: str, + ): + super(LSTMWrapper, self).__init__( + obs_space, action_space, None, model_config, name + ) + # At this point, self.num_outputs is the number of nodes coming + # from the wrapped (underlying) model. In other words, self.num_outputs + # is the input size for the LSTM layer. + # If None, set it to the observation space. + if self.num_outputs is None: + self.num_outputs = int(np.prod(self.obs_space.shape)) + + self.cell_size = model_config["lstm_cell_size"] + self.use_prev_action = model_config["lstm_use_prev_action"] + self.use_prev_reward = model_config["lstm_use_prev_reward"] + + self.action_space_struct = get_base_struct_from_space(self.action_space) + self.action_dim = 0 + + for space in tree.flatten(self.action_space_struct): + if isinstance(space, Discrete): + self.action_dim += space.n + elif isinstance(space, MultiDiscrete): + self.action_dim += np.sum(space.nvec) + elif space.shape is not None: + self.action_dim += int(np.prod(space.shape)) + else: + self.action_dim += int(len(space)) + + # Add prev-action/reward nodes to input to LSTM. + if self.use_prev_action: + self.num_outputs += self.action_dim + if self.use_prev_reward: + self.num_outputs += 1 + + # Define input layers. 
+ input_layer = tf.keras.layers.Input( + shape=(None, self.num_outputs), name="inputs" + ) + + # Set self.num_outputs to the number of output nodes desired by the + # caller of this constructor. + self.num_outputs = num_outputs + + state_in_h = tf.keras.layers.Input(shape=(self.cell_size,), name="h") + state_in_c = tf.keras.layers.Input(shape=(self.cell_size,), name="c") + seq_in = tf.keras.layers.Input(shape=(), name="seq_in", dtype=tf.int32) + + # Preprocess observation with a hidden layer and send to LSTM cell + lstm_out, state_h, state_c = tf.keras.layers.LSTM( + self.cell_size, return_sequences=True, return_state=True, name="lstm" + )( + inputs=input_layer, + mask=tf.sequence_mask(seq_in), + initial_state=[state_in_h, state_in_c], + ) + + # Postprocess LSTM output with another hidden layer and compute values + logits = tf.keras.layers.Dense( + self.num_outputs, activation=tf.keras.activations.linear, name="logits" + )(lstm_out) + values = tf.keras.layers.Dense(1, activation=None, name="values")(lstm_out) + + # Create the RNN model + self._rnn_model = tf.keras.Model( + inputs=[input_layer, seq_in, state_in_h, state_in_c], + outputs=[logits, values, state_h, state_c], + ) + # Print out model summary in INFO logging mode. + if logger.isEnabledFor(logging.INFO): + self._rnn_model.summary() + + # Add prev-a/r to this model's view, if required. + if model_config["lstm_use_prev_action"]: + self.view_requirements[SampleBatch.PREV_ACTIONS] = ViewRequirement( + SampleBatch.ACTIONS, space=self.action_space, shift=-1 + ) + if model_config["lstm_use_prev_reward"]: + self.view_requirements[SampleBatch.PREV_REWARDS] = ViewRequirement( + SampleBatch.REWARDS, shift=-1 + ) + + @override(RecurrentNetwork) + def forward( + self, + input_dict: Dict[str, TensorType], + state: List[TensorType], + seq_lens: TensorType, + ) -> Tuple[TensorType, List[TensorType]]: + assert seq_lens is not None + # Push obs through "unwrapped" net's `forward()` first. 
+ wrapped_out, _ = self._wrapped_forward(input_dict, [], None) + + # Concat. prev-action/reward if required. + prev_a_r = [] + + # Prev actions. + if self.model_config["lstm_use_prev_action"]: + prev_a = input_dict[SampleBatch.PREV_ACTIONS] + # If actions are not processed yet (in their original form as + # have been sent to environment): + # Flatten/one-hot into 1D array. + if self.model_config["_disable_action_flattening"]: + prev_a_r.append( + flatten_inputs_to_1d_tensor( + prev_a, + spaces_struct=self.action_space_struct, + time_axis=False, + ) + ) + # If actions are already flattened (but not one-hot'd yet!), + # one-hot discrete/multi-discrete actions here. + else: + if isinstance(self.action_space, (Discrete, MultiDiscrete)): + prev_a = one_hot(prev_a, self.action_space) + prev_a_r.append( + tf.reshape(tf.cast(prev_a, tf.float32), [-1, self.action_dim]) + ) + # Prev rewards. + if self.model_config["lstm_use_prev_reward"]: + prev_a_r.append( + tf.reshape( + tf.cast(input_dict[SampleBatch.PREV_REWARDS], tf.float32), [-1, 1] + ) + ) + + # Concat prev. actions + rewards to the "main" input. + if prev_a_r: + wrapped_out = tf.concat([wrapped_out] + prev_a_r, axis=1) + + # Push everything through our LSTM. 
+ input_dict["obs_flat"] = wrapped_out + return super().forward(input_dict, state, seq_lens) + + @override(RecurrentNetwork) + def forward_rnn( + self, inputs: TensorType, state: List[TensorType], seq_lens: TensorType + ) -> Tuple[TensorType, List[TensorType]]: + model_out, self._value_out, h, c = self._rnn_model([inputs, seq_lens] + state) + return model_out, [h, c] + + @override(ModelV2) + def get_initial_state(self) -> List[np.ndarray]: + return [ + np.zeros(self.cell_size, np.float32), + np.zeros(self.cell_size, np.float32), + ] + + @override(ModelV2) + def value_function(self) -> TensorType: + return tf.reshape(self._value_out, [-1]) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/tf_action_dist.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/tf_action_dist.py new file mode 100644 index 0000000000000000000000000000000000000000..683d1939776d3f346ec5f52e97ca5b95e049cdb6 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/tf_action_dist.py @@ -0,0 +1,735 @@ +import functools +import gymnasium as gym +from math import log +import numpy as np +import tree # pip install dm_tree +from typing import Optional + +from ray.rllib.models.action_dist import ActionDistribution +from ray.rllib.models.modelv2 import ModelV2 +from ray.rllib.utils import MIN_LOG_NN_OUTPUT, MAX_LOG_NN_OUTPUT, SMALL_NUMBER +from ray.rllib.utils.annotations import OldAPIStack, override +from ray.rllib.utils.framework import try_import_tf, try_import_tfp +from ray.rllib.utils.spaces.space_utils import get_base_struct_from_space +from ray.rllib.utils.typing import TensorType, List, Union, Tuple, ModelConfigDict + +tf1, tf, tfv = try_import_tf() +tfp = try_import_tfp() + + +@OldAPIStack +class TFActionDistribution(ActionDistribution): + """TF-specific extensions for building action distributions.""" + + @override(ActionDistribution) + def __init__(self, inputs: List[TensorType], model: ModelV2): + super().__init__(inputs, model) + self.sample_op = 
self._build_sample_op() + self.sampled_action_logp_op = self.logp(self.sample_op) + + def _build_sample_op(self) -> TensorType: + """Implement this instead of sample(), to enable op reuse. + + This is needed since the sample op is non-deterministic and is shared + between sample() and sampled_action_logp(). + """ + raise NotImplementedError + + @override(ActionDistribution) + def sample(self) -> TensorType: + """Draw a sample from the action distribution.""" + return self.sample_op + + @override(ActionDistribution) + def sampled_action_logp(self) -> TensorType: + """Returns the log probability of the sampled action.""" + return self.sampled_action_logp_op + + +@OldAPIStack +class Categorical(TFActionDistribution): + """Categorical distribution for discrete action spaces.""" + + def __init__( + self, inputs: List[TensorType], model: ModelV2 = None, temperature: float = 1.0 + ): + assert temperature > 0.0, "Categorical `temperature` must be > 0.0!" + # Allow softmax formula w/ temperature != 1.0: + # Divide inputs by temperature. 
+ super().__init__(inputs / temperature, model) + + @override(ActionDistribution) + def deterministic_sample(self) -> TensorType: + return tf.math.argmax(self.inputs, axis=1) + + @override(ActionDistribution) + def logp(self, x: TensorType) -> TensorType: + return -tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=self.inputs, labels=tf.cast(x, tf.int32) + ) + + @override(ActionDistribution) + def entropy(self) -> TensorType: + a0 = self.inputs - tf.reduce_max(self.inputs, axis=1, keepdims=True) + ea0 = tf.exp(a0) + z0 = tf.reduce_sum(ea0, axis=1, keepdims=True) + p0 = ea0 / z0 + return tf.reduce_sum(p0 * (tf.math.log(z0) - a0), axis=1) + + @override(ActionDistribution) + def kl(self, other: ActionDistribution) -> TensorType: + a0 = self.inputs - tf.reduce_max(self.inputs, axis=1, keepdims=True) + a1 = other.inputs - tf.reduce_max(other.inputs, axis=1, keepdims=True) + ea0 = tf.exp(a0) + ea1 = tf.exp(a1) + z0 = tf.reduce_sum(ea0, axis=1, keepdims=True) + z1 = tf.reduce_sum(ea1, axis=1, keepdims=True) + p0 = ea0 / z0 + return tf.reduce_sum(p0 * (a0 - tf.math.log(z0) - a1 + tf.math.log(z1)), axis=1) + + @override(TFActionDistribution) + def _build_sample_op(self) -> TensorType: + return tf.squeeze(tf.random.categorical(self.inputs, 1), axis=1) + + @staticmethod + @override(ActionDistribution) + def required_model_output_shape(action_space, model_config): + return action_space.n + + +@OldAPIStack +def get_categorical_class_with_temperature(t: float): + """Categorical distribution class that has customized default temperature.""" + + class CategoricalWithTemperature(Categorical): + def __init__(self, inputs, model=None, temperature=t): + super().__init__(inputs, model, temperature) + + return CategoricalWithTemperature + + +@OldAPIStack +class MultiCategorical(TFActionDistribution): + """MultiCategorical distribution for MultiDiscrete action spaces.""" + + def __init__( + self, + inputs: List[TensorType], + model: ModelV2, + input_lens: Union[List[int], 
np.ndarray, Tuple[int, ...]], + action_space=None, + ): + # skip TFActionDistribution init + ActionDistribution.__init__(self, inputs, model) + self.cats = [ + Categorical(input_, model) + for input_ in tf.split(inputs, input_lens, axis=1) + ] + self.action_space = action_space + if self.action_space is None: + self.action_space = gym.spaces.MultiDiscrete( + [c.inputs.shape[1] for c in self.cats] + ) + self.sample_op = self._build_sample_op() + self.sampled_action_logp_op = self.logp(self.sample_op) + + @override(ActionDistribution) + def deterministic_sample(self) -> TensorType: + sample_ = tf.stack([cat.deterministic_sample() for cat in self.cats], axis=1) + if isinstance(self.action_space, gym.spaces.Box): + return tf.cast( + tf.reshape(sample_, [-1] + list(self.action_space.shape)), + self.action_space.dtype, + ) + return sample_ + + @override(ActionDistribution) + def logp(self, actions: TensorType) -> TensorType: + # If tensor is provided, unstack it into list. + if isinstance(actions, tf.Tensor): + if isinstance(self.action_space, gym.spaces.Box): + actions = tf.reshape( + actions, [-1, int(np.prod(self.action_space.shape))] + ) + elif isinstance(self.action_space, gym.spaces.MultiDiscrete): + actions.set_shape((None, len(self.cats))) + actions = tf.unstack(tf.cast(actions, tf.int32), axis=1) + logps = tf.stack([cat.logp(act) for cat, act in zip(self.cats, actions)]) + return tf.reduce_sum(logps, axis=0) + + @override(ActionDistribution) + def multi_entropy(self) -> TensorType: + return tf.stack([cat.entropy() for cat in self.cats], axis=1) + + @override(ActionDistribution) + def entropy(self) -> TensorType: + return tf.reduce_sum(self.multi_entropy(), axis=1) + + @override(ActionDistribution) + def multi_kl(self, other: ActionDistribution) -> TensorType: + return tf.stack( + [cat.kl(oth_cat) for cat, oth_cat in zip(self.cats, other.cats)], axis=1 + ) + + @override(ActionDistribution) + def kl(self, other: ActionDistribution) -> TensorType: + return 
tf.reduce_sum(self.multi_kl(other), axis=1) + + @override(TFActionDistribution) + def _build_sample_op(self) -> TensorType: + sample_op = tf.stack([cat.sample() for cat in self.cats], axis=1) + if isinstance(self.action_space, gym.spaces.Box): + return tf.cast( + tf.reshape(sample_op, [-1] + list(self.action_space.shape)), + dtype=self.action_space.dtype, + ) + return sample_op + + @staticmethod + @override(ActionDistribution) + def required_model_output_shape( + action_space: gym.Space, model_config: ModelConfigDict + ) -> Union[int, np.ndarray]: + # Int Box. + if isinstance(action_space, gym.spaces.Box): + assert action_space.dtype.name.startswith("int") + low_ = np.min(action_space.low) + high_ = np.max(action_space.high) + assert np.all(action_space.low == low_) + assert np.all(action_space.high == high_) + return np.prod(action_space.shape, dtype=np.int32) * (high_ - low_ + 1) + # MultiDiscrete space. + else: + # nvec is already integer, so no casting needed. + return np.sum(action_space.nvec) + + +@OldAPIStack +class SlateMultiCategorical(Categorical): + """MultiCategorical distribution for MultiDiscrete action spaces. + + The action space must be uniform, meaning all nvec items have the same size, e.g. + MultiDiscrete([10, 10, 10]), where 10 is the number of candidates to pick from + and 3 is the slate size (pick 3 out of 10). When picking candidates, no candidate + must be picked more than once. + """ + + def __init__( + self, + inputs: List[TensorType], + model: ModelV2 = None, + temperature: float = 1.0, + action_space: Optional[gym.spaces.MultiDiscrete] = None, + all_slates=None, + ): + assert temperature > 0.0, "Categorical `temperature` must be > 0.0!" + # Allow softmax formula w/ temperature != 1.0: + # Divide inputs by temperature. + super().__init__(inputs / temperature, model) + self.action_space = action_space + # Assert uniformness of the action space (all discrete buckets have the same + # size). 
+ assert isinstance(self.action_space, gym.spaces.MultiDiscrete) and all( + n == self.action_space.nvec[0] for n in self.action_space.nvec + ) + self.all_slates = all_slates + + @override(ActionDistribution) + def deterministic_sample(self) -> TensorType: + # Get a sample from the underlying Categorical (batch of ints). + sample = super().deterministic_sample() + # Use the sampled ints to pick the actual slates. + return tf.gather(self.all_slates, sample) + + @override(ActionDistribution) + def logp(self, x: TensorType) -> TensorType: + # TODO: Implement. + return tf.ones_like(self.inputs[:, 0]) + + +@OldAPIStack +class GumbelSoftmax(TFActionDistribution): + """GumbelSoftmax distr. (for differentiable sampling in discr. actions + + The Gumbel Softmax distribution [1] (also known as the Concrete [2] + distribution) is a close cousin of the relaxed one-hot categorical + distribution, whose tfp implementation we will use here plus + adjusted `sample_...` and `log_prob` methods. See discussion at [0]. + + [0] https://stackoverflow.com/questions/56226133/ + soft-actor-critic-with-discrete-action-space + + [1] Categorical Reparametrization with Gumbel-Softmax (Jang et al, 2017): + https://arxiv.org/abs/1611.01144 + [2] The Concrete Distribution: A Continuous Relaxation of Discrete Random + Variables (Maddison et al, 2017) https://arxiv.org/abs/1611.00712 + """ + + def __init__( + self, inputs: List[TensorType], model: ModelV2 = None, temperature: float = 1.0 + ): + """Initializes a GumbelSoftmax distribution. + + Args: + temperature: Temperature parameter. For low temperatures, + the expected value approaches a categorical random variable. + For high temperatures, the expected value approaches a uniform + distribution. 
+ """ + assert temperature >= 0.0 + self.dist = tfp.distributions.RelaxedOneHotCategorical( + temperature=temperature, logits=inputs + ) + self.probs = tf.nn.softmax(self.dist._distribution.logits) + super().__init__(inputs, model) + + @override(ActionDistribution) + def deterministic_sample(self) -> TensorType: + # Return the dist object's prob values. + return self.probs + + @override(ActionDistribution) + def logp(self, x: TensorType) -> TensorType: + # Override since the implementation of tfp.RelaxedOneHotCategorical + # yields positive values. + if x.shape != self.dist.logits.shape: + values = tf.one_hot( + x, self.dist.logits.shape.as_list()[-1], dtype=tf.float32 + ) + assert values.shape == self.dist.logits.shape, ( + values.shape, + self.dist.logits.shape, + ) + + # [0]'s implementation (see line below) seems to be an approximation + # to the actual Gumbel Softmax density. + return -tf.reduce_sum( + -x * tf.nn.log_softmax(self.dist.logits, axis=-1), axis=-1 + ) + + @override(TFActionDistribution) + def _build_sample_op(self) -> TensorType: + return self.dist.sample() + + @staticmethod + @override(ActionDistribution) + def required_model_output_shape( + action_space: gym.Space, model_config: ModelConfigDict + ) -> Union[int, np.ndarray]: + return action_space.n + + +@OldAPIStack +class DiagGaussian(TFActionDistribution): + """Action distribution where each vector element is a gaussian. + + The first half of the input vector defines the gaussian means, and the + second half the gaussian standard deviations. 
+ """ + + def __init__( + self, + inputs: List[TensorType], + model: ModelV2, + *, + action_space: Optional[gym.spaces.Space] = None + ): + mean, log_std = tf.split(inputs, 2, axis=1) + self.mean = mean + self.log_std = log_std + self.std = tf.exp(log_std) + # Remember to squeeze action samples in case action space is Box(shape) + self.zero_action_dim = action_space and action_space.shape == () + super().__init__(inputs, model) + + @override(ActionDistribution) + def deterministic_sample(self) -> TensorType: + return self.mean + + @override(ActionDistribution) + def logp(self, x: TensorType) -> TensorType: + # Cover case where action space is Box(shape=()). + if int(tf.shape(x).shape[0]) == 1: + x = tf.expand_dims(x, axis=1) + return ( + -0.5 + * tf.reduce_sum( + tf.math.square((tf.cast(x, tf.float32) - self.mean) / self.std), axis=1 + ) + - 0.5 * np.log(2.0 * np.pi) * tf.cast(tf.shape(x)[1], tf.float32) + - tf.reduce_sum(self.log_std, axis=1) + ) + + @override(ActionDistribution) + def kl(self, other: ActionDistribution) -> TensorType: + assert isinstance(other, DiagGaussian) + return tf.reduce_sum( + other.log_std + - self.log_std + + (tf.math.square(self.std) + tf.math.square(self.mean - other.mean)) + / (2.0 * tf.math.square(other.std)) + - 0.5, + axis=1, + ) + + @override(ActionDistribution) + def entropy(self) -> TensorType: + return tf.reduce_sum(self.log_std + 0.5 * np.log(2.0 * np.pi * np.e), axis=1) + + @override(TFActionDistribution) + def _build_sample_op(self) -> TensorType: + sample = self.mean + self.std * tf.random.normal(tf.shape(self.mean)) + if self.zero_action_dim: + return tf.squeeze(sample, axis=-1) + return sample + + @staticmethod + @override(ActionDistribution) + def required_model_output_shape( + action_space: gym.Space, model_config: ModelConfigDict + ) -> Union[int, np.ndarray]: + return np.prod(action_space.shape, dtype=np.int32) * 2 + + +@OldAPIStack +class SquashedGaussian(TFActionDistribution): + """A tanh-squashed Gaussian 
distribution defined by: mean, std, low, high. + + The distribution will never return low or high exactly, but + `low`+SMALL_NUMBER or `high`-SMALL_NUMBER respectively. + """ + + def __init__( + self, + inputs: List[TensorType], + model: ModelV2, + low: float = -1.0, + high: float = 1.0, + ): + """Parameterizes the distribution via `inputs`. + + Args: + low: The lowest possible sampling value + (excluding this value). + high: The highest possible sampling value + (excluding this value). + """ + assert tfp is not None + mean, log_std = tf.split(inputs, 2, axis=-1) + # Clip `scale` values (coming from NN) to reasonable values. + log_std = tf.clip_by_value(log_std, MIN_LOG_NN_OUTPUT, MAX_LOG_NN_OUTPUT) + std = tf.exp(log_std) + self.distr = tfp.distributions.Normal(loc=mean, scale=std) + assert np.all(np.less(low, high)) + self.low = low + self.high = high + super().__init__(inputs, model) + + @override(ActionDistribution) + def deterministic_sample(self) -> TensorType: + mean = self.distr.mean() + return self._squash(mean) + + @override(TFActionDistribution) + def _build_sample_op(self) -> TensorType: + return self._squash(self.distr.sample()) + + @override(ActionDistribution) + def logp(self, x: TensorType) -> TensorType: + # Unsquash values (from [low,high] to ]-inf,inf[) + unsquashed_values = tf.cast(self._unsquash(x), self.inputs.dtype) + # Get log prob of unsquashed values from our Normal. + log_prob_gaussian = self.distr.log_prob(unsquashed_values) + # For safety reasons, clamp somehow, only then sum up. + log_prob_gaussian = tf.clip_by_value(log_prob_gaussian, -100, 100) + log_prob_gaussian = tf.reduce_sum(log_prob_gaussian, axis=-1) + # Get log-prob for squashed Gaussian. 
+ unsquashed_values_tanhd = tf.math.tanh(unsquashed_values) + log_prob = log_prob_gaussian - tf.reduce_sum( + tf.math.log(1 - unsquashed_values_tanhd**2 + SMALL_NUMBER), axis=-1 + ) + return log_prob + + def sample_logp(self): + z = self.distr.sample() + actions = self._squash(z) + return actions, tf.reduce_sum( + self.distr.log_prob(z) - tf.math.log(1 - actions * actions + SMALL_NUMBER), + axis=-1, + ) + + @override(ActionDistribution) + def entropy(self) -> TensorType: + raise ValueError("Entropy not defined for SquashedGaussian!") + + @override(ActionDistribution) + def kl(self, other: ActionDistribution) -> TensorType: + raise ValueError("KL not defined for SquashedGaussian!") + + def _squash(self, raw_values: TensorType) -> TensorType: + # Returned values are within [low, high] (including `low` and `high`). + squashed = ((tf.math.tanh(raw_values) + 1.0) / 2.0) * ( + self.high - self.low + ) + self.low + return tf.clip_by_value(squashed, self.low, self.high) + + def _unsquash(self, values: TensorType) -> TensorType: + normed_values = (values - self.low) / (self.high - self.low) * 2.0 - 1.0 + # Stabilize input to atanh. + save_normed_values = tf.clip_by_value( + normed_values, -1.0 + SMALL_NUMBER, 1.0 - SMALL_NUMBER + ) + unsquashed = tf.math.atanh(save_normed_values) + return unsquashed + + @staticmethod + @override(ActionDistribution) + def required_model_output_shape( + action_space: gym.Space, model_config: ModelConfigDict + ) -> Union[int, np.ndarray]: + return np.prod(action_space.shape, dtype=np.int32) * 2 + + +@OldAPIStack +class Beta(TFActionDistribution): + """ + A Beta distribution is defined on the interval [0, 1] and parameterized by + shape parameters alpha and beta (also called concentration parameters). + + PDF(x; alpha, beta) = x**(alpha - 1) (1 - x)**(beta - 1) / Z + with Z = Gamma(alpha) Gamma(beta) / Gamma(alpha + beta) + and Gamma(n) = (n - 1)! 
+ """ + + def __init__( + self, + inputs: List[TensorType], + model: ModelV2, + low: float = 0.0, + high: float = 1.0, + ): + # Stabilize input parameters (possibly coming from a linear layer). + inputs = tf.clip_by_value(inputs, log(SMALL_NUMBER), -log(SMALL_NUMBER)) + inputs = tf.math.log(tf.math.exp(inputs) + 1.0) + 1.0 + self.low = low + self.high = high + alpha, beta = tf.split(inputs, 2, axis=-1) + # Note: concentration0==beta, concentration1=alpha (!) + self.dist = tfp.distributions.Beta(concentration1=alpha, concentration0=beta) + super().__init__(inputs, model) + + @override(ActionDistribution) + def deterministic_sample(self) -> TensorType: + mean = self.dist.mean() + return self._squash(mean) + + @override(TFActionDistribution) + def _build_sample_op(self) -> TensorType: + return self._squash(self.dist.sample()) + + @override(ActionDistribution) + def logp(self, x: TensorType) -> TensorType: + unsquashed_values = self._unsquash(x) + return tf.math.reduce_sum(self.dist.log_prob(unsquashed_values), axis=-1) + + def _squash(self, raw_values: TensorType) -> TensorType: + return raw_values * (self.high - self.low) + self.low + + def _unsquash(self, values: TensorType) -> TensorType: + return (values - self.low) / (self.high - self.low) + + @staticmethod + @override(ActionDistribution) + def required_model_output_shape( + action_space: gym.Space, model_config: ModelConfigDict + ) -> Union[int, np.ndarray]: + return np.prod(action_space.shape, dtype=np.int32) * 2 + + +@OldAPIStack +class Deterministic(TFActionDistribution): + """Action distribution that returns the input values directly. + + This is similar to DiagGaussian with standard deviation zero (thus only + requiring the "mean" values as NN output). 
+ """ + + @override(ActionDistribution) + def deterministic_sample(self) -> TensorType: + return self.inputs + + @override(TFActionDistribution) + def logp(self, x: TensorType) -> TensorType: + return tf.zeros_like(self.inputs) + + @override(TFActionDistribution) + def _build_sample_op(self) -> TensorType: + return self.inputs + + @staticmethod + @override(ActionDistribution) + def required_model_output_shape( + action_space: gym.Space, model_config: ModelConfigDict + ) -> Union[int, np.ndarray]: + return np.prod(action_space.shape, dtype=np.int32) + + +@OldAPIStack +class MultiActionDistribution(TFActionDistribution): + """Action distribution that operates on a set of actions. + + Args: + inputs (Tensor list): A list of tensors from which to compute samples. + """ + + def __init__( + self, inputs, model, *, child_distributions, input_lens, action_space, **kwargs + ): + ActionDistribution.__init__(self, inputs, model) + + self.action_space_struct = get_base_struct_from_space(action_space) + + self.input_lens = np.array(input_lens, dtype=np.int32) + split_inputs = tf.split(inputs, self.input_lens, axis=1) + self.flat_child_distributions = tree.map_structure( + lambda dist, input_: dist(input_, model, **kwargs), + child_distributions, + split_inputs, + ) + + @override(ActionDistribution) + def logp(self, x): + # Single tensor input (all merged). + if isinstance(x, (tf.Tensor, np.ndarray)): + split_indices = [] + for dist in self.flat_child_distributions: + if isinstance(dist, Categorical): + split_indices.append(1) + elif ( + isinstance(dist, MultiCategorical) and dist.action_space is not None + ): + split_indices.append(np.prod(dist.action_space.shape)) + else: + sample = dist.sample() + # Cover Box(shape=()) case. + if len(sample.shape) == 1: + split_indices.append(1) + else: + split_indices.append(tf.shape(sample)[1]) + split_x = tf.split(x, split_indices, axis=1) + # Structured or flattened (by single action component) input. 
+ else: + split_x = tree.flatten(x) + + def map_(val, dist): + # Remove extra categorical dimension. + if isinstance(dist, Categorical): + val = tf.cast( + tf.squeeze(val, axis=-1) if len(val.shape) > 1 else val, tf.int32 + ) + return dist.logp(val) + + # Remove extra categorical dimension and take the logp of each + # component. + flat_logps = tree.map_structure(map_, split_x, self.flat_child_distributions) + + return functools.reduce(lambda a, b: a + b, flat_logps) + + @override(ActionDistribution) + def kl(self, other): + kl_list = [ + d.kl(o) + for d, o in zip( + self.flat_child_distributions, other.flat_child_distributions + ) + ] + return functools.reduce(lambda a, b: a + b, kl_list) + + @override(ActionDistribution) + def entropy(self): + entropy_list = [d.entropy() for d in self.flat_child_distributions] + return functools.reduce(lambda a, b: a + b, entropy_list) + + @override(ActionDistribution) + def sample(self): + child_distributions = tree.unflatten_as( + self.action_space_struct, self.flat_child_distributions + ) + return tree.map_structure(lambda s: s.sample(), child_distributions) + + @override(ActionDistribution) + def deterministic_sample(self): + child_distributions = tree.unflatten_as( + self.action_space_struct, self.flat_child_distributions + ) + return tree.map_structure( + lambda s: s.deterministic_sample(), child_distributions + ) + + @override(TFActionDistribution) + def sampled_action_logp(self): + p = self.flat_child_distributions[0].sampled_action_logp() + for c in self.flat_child_distributions[1:]: + p += c.sampled_action_logp() + return p + + @override(ActionDistribution) + def required_model_output_shape(self, action_space, model_config): + return np.sum(self.input_lens, dtype=np.int32) + + +@OldAPIStack +class Dirichlet(TFActionDistribution): + """Dirichlet distribution for continuous actions that are between + [0,1] and sum to 1. + + e.g. 
actions that represent resource allocation.""" + + def __init__(self, inputs: List[TensorType], model: ModelV2): + """Input is a tensor of logits. The exponential of logits is used to + parametrize the Dirichlet distribution as all parameters need to be + positive. An arbitrary small epsilon is added to the concentration + parameters to be zero due to numerical error. + + See issue #4440 for more details. + """ + self.epsilon = 1e-7 + concentration = tf.exp(inputs) + self.epsilon + self.dist = tf1.distributions.Dirichlet( + concentration=concentration, + validate_args=True, + allow_nan_stats=False, + ) + super().__init__(concentration, model) + + @override(ActionDistribution) + def deterministic_sample(self) -> TensorType: + return tf.nn.softmax(self.dist.concentration) + + @override(ActionDistribution) + def logp(self, x: TensorType) -> TensorType: + # Support of Dirichlet are positive real numbers. x is already + # an array of positive numbers, but we clip to avoid zeros due to + # numerical errors. 
+ x = tf.maximum(x, self.epsilon) + x = x / tf.reduce_sum(x, axis=-1, keepdims=True) + return self.dist.log_prob(x) + + @override(ActionDistribution) + def entropy(self) -> TensorType: + return self.dist.entropy() + + @override(ActionDistribution) + def kl(self, other: ActionDistribution) -> TensorType: + return self.dist.kl_divergence(other.dist) + + @override(TFActionDistribution) + def _build_sample_op(self) -> TensorType: + return self.dist.sample() + + @staticmethod + @override(ActionDistribution) + def required_model_output_shape( + action_space: gym.Space, model_config: ModelConfigDict + ) -> Union[int, np.ndarray]: + return np.prod(action_space.shape, dtype=np.int32) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/tf_distributions.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/tf_distributions.py new file mode 100644 index 0000000000000000000000000000000000000000..a99898f53e7f63b4c6a6d2e9ceb95dfc041f940c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/tf_distributions.py @@ -0,0 +1,552 @@ +"""The main difference between this and the old ActionDistribution is that this one +has more explicit input args. So that the input format does not have to be guessed from +the code. This matches the design pattern of torch distribution which developers may +already be familiar with. +""" +import gymnasium as gym +import tree +import numpy as np +from typing import Dict, Iterable, List, Optional +import abc + + +from ray.rllib.models.distributions import Distribution +from ray.rllib.utils.annotations import override, DeveloperAPI +from ray.rllib.utils.framework import try_import_tf, try_import_tfp +from ray.rllib.utils.typing import TensorType, Union, Tuple + + +_, tf, _ = try_import_tf() +tfp = try_import_tfp() + +# TODO (Kourosh) Write unittest for this class similar to torch distributions. 
+ + +@DeveloperAPI +class TfDistribution(Distribution, abc.ABC): + """Wrapper class for tfp.distributions.""" + + def __init__(self, *args, **kwargs): + super().__init__() + self._dist = self._get_tf_distribution(*args, **kwargs) + + @abc.abstractmethod + def _get_tf_distribution(self, *args, **kwargs) -> "tfp.distributions.Distribution": + """Returns the tfp.distributions.Distribution object to use.""" + + @override(Distribution) + def logp(self, value: TensorType, **kwargs) -> TensorType: + return self._dist.log_prob(value, **kwargs) + + @override(Distribution) + def entropy(self) -> TensorType: + return self._dist.entropy() + + @override(Distribution) + def kl(self, other: "Distribution") -> TensorType: + return self._dist.kl_divergence(other._dist) + + @override(Distribution) + def sample( + self, *, sample_shape=() + ) -> Union[TensorType, Tuple[TensorType, TensorType]]: + sample = self._dist.sample(sample_shape) + return sample + + @override(Distribution) + def rsample( + self, *, sample_shape=() + ) -> Union[TensorType, Tuple[TensorType, TensorType]]: + raise NotImplementedError + + +@DeveloperAPI +class TfCategorical(TfDistribution): + """Wrapper class for Categorical distribution. + + Creates a categorical distribution parameterized by either :attr:`probs` or + :attr:`logits` (but not both). + + Samples are integers from :math:`\{0, \ldots, K-1\}` where `K` is + ``probs.size(-1)``. + + If `probs` is 1-dimensional with length-`K`, each element is the relative + probability of sampling the class at that index. + + If `probs` is N-dimensional, the first N-1 dimensions are treated as a batch of + relative probability vectors. + + .. testcode:: + :skipif: True + + m = TfCategorical([ 0.25, 0.25, 0.25, 0.25 ]) + m.sample(sample_shape=(2,)) # equal probability of 0, 1, 2, 3 + + .. testoutput:: + + tf.Tensor([2 3], shape=(2,), dtype=int32) + + Args: + probs: The probablities of each event. 
+ logits: Event log probabilities (unnormalized) + temperature: In case of using logits, this parameter can be used to determine + the sharpness of the distribution. i.e. + ``probs = softmax(logits / temperature)``. The temperature must be strictly + positive. A low value (e.g. 1e-10) will result in argmax sampling while a + larger value will result in uniform sampling. + """ + + @override(TfDistribution) + def __init__( + self, + probs: "tf.Tensor" = None, + logits: "tf.Tensor" = None, + ) -> None: + # We assert this here because to_deterministic makes this assumption. + assert (probs is None) != ( + logits is None + ), "Exactly one out of `probs` and `logits` must be set!" + + self.probs = probs + self.logits = logits + self.one_hot = tfp.distributions.OneHotCategorical(logits=logits, probs=probs) + super().__init__(logits=logits, probs=probs) + + @override(Distribution) + def logp(self, value: TensorType, **kwargs) -> TensorType: + # This prevents an error in which float values at the boundaries of the range + # of the distribution are passed to this function. 
+ return -tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=self.logits if self.logits is not None else tf.log(self.probs), + labels=tf.cast(value, tf.int32), + ) + + @override(TfDistribution) + def _get_tf_distribution( + self, + probs: "tf.Tensor" = None, + logits: "tf.Tensor" = None, + ) -> "tfp.distributions.Distribution": + return tfp.distributions.Categorical(probs=probs, logits=logits) + + @staticmethod + @override(Distribution) + def required_input_dim(space: gym.Space, **kwargs) -> int: + assert isinstance(space, gym.spaces.Discrete) + return int(space.n) + + @override(Distribution) + def rsample(self, sample_shape=()): + one_hot_sample = self.one_hot.sample(sample_shape) + return tf.stop_gradients(one_hot_sample - self.probs) + self.probs + + @classmethod + @override(Distribution) + def from_logits(cls, logits: TensorType, **kwargs) -> "TfCategorical": + return TfCategorical(logits=logits, **kwargs) + + def to_deterministic(self) -> "TfDeterministic": + if self.probs is not None: + probs_or_logits = self.probs + else: + probs_or_logits = self.logits + + return TfDeterministic(loc=tf.math.argmax(probs_or_logits, axis=-1)) + + +@DeveloperAPI +class TfDiagGaussian(TfDistribution): + """Wrapper class for Normal distribution. + + Creates a normal distribution parameterized by :attr:`loc` and :attr:`scale`. In + case of multi-dimensional distribution, the variance is assumed to be diagonal. + + .. testcode:: + :skipif: True + + m = TfDiagGaussian(loc=[0.0, 0.0], scale=[1.0, 1.0]) + m.sample(sample_shape=(2,)) # 2d normal dist with loc=0 and scale=1 + + .. testoutput:: + + tensor([[ 0.1046, -0.6120], [ 0.234, 0.556]]) + + .. testcode:: + :skipif: True + + # scale is None + m = TfDiagGaussian(loc=[0.0, 1.0]) + m.sample(sample_shape=(2,)) # normally distributed with loc=0 and scale=1 + + .. testoutput:: + + tensor([0.1046, 0.6120]) + + + Args: + loc: mean of the distribution (often referred to as mu). 
If scale is None, the + second half of the `loc` will be used as the log of scale. + scale: standard deviation of the distribution (often referred to as sigma). + Has to be positive. + """ + + @override(TfDistribution) + def __init__( + self, + loc: Union[float, TensorType], + scale: Optional[Union[float, TensorType]] = None, + ): + self.loc = loc + super().__init__(loc=loc, scale=scale) + + @override(TfDistribution) + def _get_tf_distribution(self, loc, scale) -> "tfp.distributions.Distribution": + return tfp.distributions.Normal(loc=loc, scale=scale) + + @override(TfDistribution) + def logp(self, value: TensorType) -> TensorType: + return tf.math.reduce_sum(super().logp(value), axis=-1) + + @override(TfDistribution) + def entropy(self) -> TensorType: + return tf.math.reduce_sum(super().entropy(), axis=-1) + + @override(TfDistribution) + def kl(self, other: "TfDistribution") -> TensorType: + return tf.math.reduce_sum(super().kl(other), axis=-1) + + @staticmethod + @override(Distribution) + def required_input_dim(space: gym.Space, **kwargs) -> int: + assert isinstance(space, gym.spaces.Box) + return int(np.prod(space.shape, dtype=np.int32) * 2) + + @override(Distribution) + def rsample(self, sample_shape=()): + eps = tf.random.normal(sample_shape) + return self._dist.loc + eps * self._dist.scale + + @classmethod + @override(Distribution) + def from_logits(cls, logits: TensorType, **kwargs) -> "TfDiagGaussian": + loc, log_std = tf.split(logits, num_or_size_splits=2, axis=-1) + scale = tf.math.exp(log_std) + return TfDiagGaussian(loc=loc, scale=scale) + + def to_deterministic(self) -> "TfDeterministic": + return TfDeterministic(loc=self.loc) + + +@DeveloperAPI +class TfDeterministic(Distribution): + """The distribution that returns the input values directly. + + This is similar to DiagGaussian with standard deviation zero (thus only + requiring the "mean" values as NN output). + + Note: entropy is always zero, ang logp and kl are not implemented. + + .. 
testcode:: + :skipif: True + + m = TfDeterministic(loc=tf.constant([0.0, 0.0])) + m.sample(sample_shape=(2,)) + + .. testoutput:: + + Tensor([[ 0.0, 0.0], [ 0.0, 0.0]]) + + Args: + loc: the determinsitic value to return + """ + + @override(Distribution) + def __init__(self, loc: "tf.Tensor") -> None: + super().__init__() + self.loc = loc + + @override(Distribution) + def sample( + self, + *, + sample_shape: Tuple[int, ...] = (), + **kwargs, + ) -> Union[TensorType, Tuple[TensorType, TensorType]]: + shape = sample_shape + self.loc.shape + return tf.ones(shape, dtype=self.loc.dtype) * self.loc + + @override(Distribution) + def rsample( + self, + *, + sample_shape: Tuple[int, ...] = None, + **kwargs, + ) -> Union[TensorType, Tuple[TensorType, TensorType]]: + raise NotImplementedError + + @override(Distribution) + def logp(self, value: TensorType, **kwargs) -> TensorType: + return tf.zeros_like(self.loc) + + @override(Distribution) + def entropy(self, **kwargs) -> TensorType: + raise RuntimeError(f"`entropy()` not supported for {self.__class__.__name__}.") + + @override(Distribution) + def kl(self, other: "Distribution", **kwargs) -> TensorType: + raise RuntimeError(f"`kl()` not supported for {self.__class__.__name__}.") + + @staticmethod + @override(Distribution) + def required_input_dim(space: gym.Space, **kwargs) -> int: + assert isinstance(space, gym.spaces.Box) + return int(np.prod(space.shape, dtype=np.int32)) + + @classmethod + @override(Distribution) + def from_logits(cls, logits: TensorType, **kwargs) -> "TfDeterministic": + return TfDeterministic(loc=logits) + + def to_deterministic(self) -> "TfDeterministic": + return self + + +@DeveloperAPI +class TfMultiCategorical(Distribution): + """MultiCategorical distribution for MultiDiscrete action spaces.""" + + @override(Distribution) + def __init__( + self, + categoricals: List[TfCategorical], + ): + super().__init__() + self._cats = categoricals + + @override(Distribution) + def sample(self) -> TensorType: + arr 
= [cat.sample() for cat in self._cats] + sample_ = tf.stack(arr, axis=-1) + return sample_ + + @override(Distribution) + def rsample(self, sample_shape=()): + arr = [cat.rsample() for cat in self._cats] + sample_ = tf.stack(arr, axis=-1) + return sample_ + + @override(Distribution) + def logp(self, value: tf.Tensor) -> TensorType: + actions = tf.unstack(tf.cast(value, tf.int32), axis=-1) + logps = tf.stack([cat.logp(act) for cat, act in zip(self._cats, actions)]) + return tf.reduce_sum(logps, axis=0) + + @override(Distribution) + def entropy(self) -> TensorType: + return tf.reduce_sum( + tf.stack([cat.entropy() for cat in self._cats], axis=-1), axis=-1 + ) + + @override(Distribution) + def kl(self, other: Distribution) -> TensorType: + kls = tf.stack( + [cat.kl(oth_cat) for cat, oth_cat in zip(self._cats, other._cats)], axis=-1 + ) + return tf.reduce_sum(kls, axis=-1) + + @staticmethod + @override(Distribution) + def required_input_dim(space: gym.Space, **kwargs) -> int: + assert isinstance(space, gym.spaces.MultiDiscrete) + return int(np.sum(space.nvec)) + + @classmethod + @override(Distribution) + def from_logits( + cls, + logits: tf.Tensor, + input_lens: List[int], + **kwargs, + ) -> "TfMultiCategorical": + """Creates this Distribution from logits (and additional arguments). + + If you wish to create this distribution from logits only, please refer to + `Distribution.get_partial_dist_cls()`. + + Args: + logits: The tensor containing logits to be separated by logit_lens. + child_distribution_cls_struct: A struct of Distribution classes that can + be instantiated from the given logits. + input_lens: A list of integers that indicate the length of the logits + vectors to be passed into each child distribution. + **kwargs: Forward compatibility kwargs. 
+ """ + categoricals = [ + TfCategorical(logits=logits) + for logits in tf.split(logits, input_lens, axis=-1) + ] + + return TfMultiCategorical(categoricals=categoricals) + + def to_deterministic(self) -> "TfMultiDistribution": + return TfMultiDistribution([cat.to_deterministic() for cat in self._cats]) + + +@DeveloperAPI +class TfMultiDistribution(Distribution): + """Action distribution that operates on multiple, possibly nested actions.""" + + def __init__( + self, + child_distribution_struct: Union[Tuple, List, Dict], + ): + """Initializes a TfMultiDistribution object. + + Args: + child_distribution_struct: Any struct + that contains the child distribution classes to use to + instantiate the child distributions from `logits`. + """ + super().__init__() + self._original_struct = child_distribution_struct + self._flat_child_distributions = tree.flatten(child_distribution_struct) + + @override(Distribution) + def rsample( + self, + *, + sample_shape: Tuple[int, ...] = None, + **kwargs, + ) -> Union[TensorType, Tuple[TensorType, TensorType]]: + rsamples = [] + for dist in self._flat_child_distributions: + rsample = dist.rsample(sample_shape=sample_shape, **kwargs) + rsamples.append(rsample) + + rsamples = tree.unflatten_as(self._original_struct, rsamples) + return rsamples + + @override(Distribution) + def logp(self, value): + # Single tensor input (all merged). + if isinstance(value, (tf.Tensor, np.ndarray)): + split_indices = [] + for dist in self._flat_child_distributions: + if isinstance(dist, TfCategorical): + split_indices.append(1) + elif isinstance(dist, TfMultiCategorical): + split_indices.append(len(dist._cats)) + else: + sample = dist.sample() + # Cover Box(shape=()) case. + if len(sample.shape) == 1: + split_indices.append(1) + else: + split_indices.append(tf.shape(sample)[1]) + split_value = tf.split(value, split_indices, axis=1) + # Structured or flattened (by single action component) input. 
+ else: + split_value = tree.flatten(value) + + def map_(val, dist): + # Remove extra dimension if present. + if ( + isinstance(dist, TfCategorical) + and len(val.shape) > 1 + and val.shape[-1] == 1 + ): + val = tf.squeeze(val, axis=-1) + + return dist.logp(val) + + # Remove extra categorical dimension and take the logp of each + # component. + flat_logps = tree.map_structure( + map_, split_value, self._flat_child_distributions + ) + + return sum(flat_logps) + + @override(Distribution) + def kl(self, other): + kl_list = [ + d.kl(o) + for d, o in zip( + self._flat_child_distributions, other._flat_child_distributions + ) + ] + return sum(kl_list) + + @override(Distribution) + def entropy(self): + entropy_list = [d.entropy() for d in self._flat_child_distributions] + return sum(entropy_list) + + @override(Distribution) + def sample(self): + child_distributions_struct = tree.unflatten_as( + self._original_struct, self._flat_child_distributions + ) + return tree.map_structure(lambda s: s.sample(), child_distributions_struct) + + @staticmethod + @override(Distribution) + def required_input_dim(space: gym.Space, input_lens: List[int], **kwargs) -> int: + return sum(input_lens) + + @classmethod + @override(Distribution) + def from_logits( + cls, + logits: tf.Tensor, + child_distribution_cls_struct: Union[Dict, Iterable], + input_lens: Union[Dict, List[int]], + space: gym.Space, + **kwargs, + ) -> "TfMultiDistribution": + """Creates this Distribution from logits (and additional arguments). + + If you wish to create this distribution from logits only, please refer to + `Distribution.get_partial_dist_cls()`. + + Args: + logits: The tensor containing logits to be separated by `input_lens`. + child_distribution_cls_struct: A struct of Distribution classes that can + be instantiated from the given logits. + child_distribution_cls_struct: A struct of Distribution classes that can + be instantiated from the given logits. 
+ input_lens: A list or dict of integers that indicate the length of each + logit. If this is given as a dict, the structure should match the + structure of child_distribution_cls_struct. + space: The possibly nested output space. + **kwargs: Forward compatibility kwargs. + + Returns: + A TfMultiDistribution object. + """ + logit_lens = tree.flatten(input_lens) + child_distribution_cls_list = tree.flatten(child_distribution_cls_struct) + split_logits = tf.split(logits, logit_lens, axis=1) + + child_distribution_list = tree.map_structure( + lambda dist, input_: dist.from_logits(input_), + child_distribution_cls_list, + list(split_logits), + ) + + child_distribution_struct = tree.unflatten_as( + child_distribution_cls_struct, child_distribution_list + ) + + return TfMultiDistribution( + child_distribution_struct=child_distribution_struct, + ) + + def to_deterministic(self) -> "TfMultiDistribution": + flat_deterministic_dists = [ + dist.to_deterministic for dist in self._flat_child_distributions + ] + deterministic_dists = tree.unflatten_as( + self._original_struct, flat_deterministic_dists + ) + return TfMultiDistribution(deterministic_dists) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/tf_modelv2.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/tf_modelv2.py new file mode 100644 index 0000000000000000000000000000000000000000..7438796944248b443b0e3c91332275745a1ab467 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/tf_modelv2.py @@ -0,0 +1,142 @@ +import contextlib +import gymnasium as gym +import re +from typing import Dict, List, Union + +from ray.util import log_once +from ray.rllib.models.modelv2 import ModelV2 +from ray.rllib.utils.annotations import OldAPIStack, override +from ray.rllib.utils.deprecation import deprecation_warning +from ray.rllib.utils.framework import try_import_tf +from ray.rllib.utils.typing import ModelConfigDict, TensorType + +tf1, tf, tfv = try_import_tf() + + +@OldAPIStack 
+class TFModelV2(ModelV2): + """TF version of ModelV2, which should contain a tf keras Model. + + Note that this class by itself is not a valid model unless you + implement forward() in a subclass.""" + + def __init__( + self, + obs_space: gym.spaces.Space, + action_space: gym.spaces.Space, + num_outputs: int, + model_config: ModelConfigDict, + name: str, + ): + """Initializes a TFModelV2 instance. + + Here is an example implementation for a subclass + ``MyModelClass(TFModelV2)``:: + + def __init__(self, *args, **kwargs): + super(MyModelClass, self).__init__(*args, **kwargs) + input_layer = tf.keras.layers.Input(...) + hidden_layer = tf.keras.layers.Dense(...)(input_layer) + output_layer = tf.keras.layers.Dense(...)(hidden_layer) + value_layer = tf.keras.layers.Dense(...)(hidden_layer) + self.base_model = tf.keras.Model( + input_layer, [output_layer, value_layer]) + """ + super().__init__( + obs_space, action_space, num_outputs, model_config, name, framework="tf" + ) + + # Deprecated: TFModelV2 now automatically track their variables. + self.var_list = [] + + if tf1.executing_eagerly(): + self.graph = None + else: + self.graph = tf1.get_default_graph() + + def context(self) -> contextlib.AbstractContextManager: + """Returns a contextmanager for the current TF graph.""" + if self.graph: + return self.graph.as_default() + else: + return ModelV2.context(self) + + def update_ops(self) -> List[TensorType]: + """Return the list of update ops for this model. 
+ + For example, this should include any BatchNorm update ops.""" + return [] + + def register_variables(self, variables: List[TensorType]) -> None: + """Register the given list of variables with this model.""" + if log_once("deprecated_tfmodelv2_register_variables"): + deprecation_warning(old="TFModelV2.register_variables", error=False) + self.var_list.extend(variables) + + @override(ModelV2) + def variables( + self, as_dict: bool = False + ) -> Union[List[TensorType], Dict[str, TensorType]]: + if as_dict: + # Old way using `register_variables`. + if self.var_list: + return {v.name: v for v in self.var_list} + # New way: Automatically determine the var tree. + else: + return self._find_sub_modules("", self.__dict__) + + # Old way using `register_variables`. + if self.var_list: + return list(self.var_list) + # New way: Automatically determine the var tree. + else: + return list(self.variables(as_dict=True).values()) + + @override(ModelV2) + def trainable_variables( + self, as_dict: bool = False + ) -> Union[List[TensorType], Dict[str, TensorType]]: + if as_dict: + return { + k: v for k, v in self.variables(as_dict=True).items() if v.trainable + } + return [v for v in self.variables() if v.trainable] + + @staticmethod + def _find_sub_modules(current_key, struct): + # Keras Model: key=k + "." + var-name (replace '/' by '.'). + if isinstance(struct, tf.keras.models.Model) or isinstance(struct, tf.Module): + ret = {} + for var in struct.variables: + name = re.sub("/", ".", var.name) + key = current_key + "." + name + ret[key] = var + return ret + # Other TFModelV2: Include its vars into ours. + elif isinstance(struct, TFModelV2): + return { + current_key + "." + key: var + for key, var in struct.variables(as_dict=True).items() + } + # tf.Variable + elif isinstance(struct, tf.Variable): + return {current_key: struct} + # List/Tuple. 
+ elif isinstance(struct, (tuple, list)): + ret = {} + for i, value in enumerate(struct): + sub_vars = TFModelV2._find_sub_modules( + current_key + "_{}".format(i), value + ) + ret.update(sub_vars) + return ret + # Dict. + elif isinstance(struct, dict): + if current_key: + current_key += "_" + ret = {} + for key, value in struct.items(): + sub_vars = TFModelV2._find_sub_modules(current_key + str(key), value) + ret.update(sub_vars) + return ret + return {} diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/visionnet.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/visionnet.py new file mode 100644 index 0000000000000000000000000000000000000000..69124c9e2e61ef48272dfcbae99503db13a98b07 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/visionnet.py @@ -0,0 +1,264 @@ +import gymnasium as gym +from typing import Dict, List + +from ray.rllib.models.tf.tf_modelv2 import TFModelV2 +from ray.rllib.models.tf.misc import normc_initializer +from ray.rllib.models.utils import get_activation_fn, get_filter_config +from ray.rllib.utils.annotations import OldAPIStack +from ray.rllib.utils.framework import try_import_tf +from ray.rllib.utils.typing import ModelConfigDict, TensorType + +tf1, tf, tfv = try_import_tf() + + +@OldAPIStack +class VisionNetwork(TFModelV2): + """Generic vision network implemented in ModelV2 API. + + An additional post-conv fully connected stack can be added and configured + via the config keys: + `post_fcnet_hiddens`: Dense layer sizes after the Conv2D stack. + `post_fcnet_activation`: Activation function to use for this FC stack. 
+ """ + + def __init__( + self, + obs_space: gym.spaces.Space, + action_space: gym.spaces.Space, + num_outputs: int, + model_config: ModelConfigDict, + name: str, + ): + if not model_config.get("conv_filters"): + model_config["conv_filters"] = get_filter_config(obs_space.shape) + + super(VisionNetwork, self).__init__( + obs_space, action_space, num_outputs, model_config, name + ) + + activation = get_activation_fn( + self.model_config.get("conv_activation"), framework="tf" + ) + filters = self.model_config["conv_filters"] + assert len(filters) > 0, "Must provide at least 1 entry in `conv_filters`!" + + # Post FC net config. + post_fcnet_hiddens = model_config.get("post_fcnet_hiddens", []) + post_fcnet_activation = get_activation_fn( + model_config.get("post_fcnet_activation"), framework="tf" + ) + + no_final_linear = self.model_config.get("no_final_linear") + vf_share_layers = self.model_config.get("vf_share_layers") + + input_shape = obs_space.shape + self.data_format = "channels_last" + + inputs = tf.keras.layers.Input(shape=input_shape, name="observations") + last_layer = inputs + # Whether the last layer is the output of a Flattened (rather than + # a n x (1,1) Conv2D). + self.last_layer_is_flattened = False + + # Build the action layers + for i, (out_size, kernel, stride) in enumerate(filters[:-1], 1): + last_layer = tf.keras.layers.Conv2D( + out_size, + kernel, + strides=stride + if isinstance(stride, (list, tuple)) + else (stride, stride), + activation=activation, + padding="same", + data_format="channels_last", + name="conv{}".format(i), + )(last_layer) + + out_size, kernel, stride = filters[-1] + + # No final linear: Last layer has activation function and exits with + # num_outputs nodes (this could be a 1x1 conv or a FC layer, depending + # on `post_fcnet_...` settings). 
+ if no_final_linear and num_outputs: + last_layer = tf.keras.layers.Conv2D( + out_size if post_fcnet_hiddens else num_outputs, + kernel, + strides=stride + if isinstance(stride, (list, tuple)) + else (stride, stride), + activation=activation, + padding="valid", + data_format="channels_last", + name="conv_out", + )(last_layer) + # Add (optional) post-fc-stack after last Conv2D layer. + layer_sizes = post_fcnet_hiddens[:-1] + ( + [num_outputs] if post_fcnet_hiddens else [] + ) + feature_out = last_layer + + for i, out_size in enumerate(layer_sizes): + feature_out = last_layer + last_layer = tf.keras.layers.Dense( + out_size, + name="post_fcnet_{}".format(i), + activation=post_fcnet_activation, + kernel_initializer=normc_initializer(1.0), + )(last_layer) + + # Finish network normally (w/o overriding last layer size with + # `num_outputs`), then add another linear one of size `num_outputs`. + else: + last_layer = tf.keras.layers.Conv2D( + out_size, + kernel, + strides=stride + if isinstance(stride, (list, tuple)) + else (stride, stride), + activation=activation, + padding="valid", + data_format="channels_last", + name="conv{}".format(len(filters)), + )(last_layer) + + # num_outputs defined. Use that to create an exact + # `num_output`-sized (1,1)-Conv2D. + if num_outputs: + if post_fcnet_hiddens: + last_cnn = last_layer = tf.keras.layers.Conv2D( + post_fcnet_hiddens[0], + [1, 1], + activation=post_fcnet_activation, + padding="same", + data_format="channels_last", + name="conv_out", + )(last_layer) + # Add (optional) post-fc-stack after last Conv2D layer. 
+ for i, out_size in enumerate( + post_fcnet_hiddens[1:] + [num_outputs] + ): + feature_out = last_layer + last_layer = tf.keras.layers.Dense( + out_size, + name="post_fcnet_{}".format(i + 1), + activation=post_fcnet_activation + if i < len(post_fcnet_hiddens) - 1 + else None, + kernel_initializer=normc_initializer(1.0), + )(last_layer) + else: + feature_out = last_layer + last_cnn = last_layer = tf.keras.layers.Conv2D( + num_outputs, + [1, 1], + activation=None, + padding="same", + data_format="channels_last", + name="conv_out", + )(last_layer) + + if last_cnn.shape[1] != 1 or last_cnn.shape[2] != 1: + raise ValueError( + "Given `conv_filters` ({}) do not result in a [B, 1, " + "1, {} (`num_outputs`)] shape (but in {})! Please " + "adjust your Conv2D stack such that the dims 1 and 2 " + "are both 1.".format( + self.model_config["conv_filters"], + self.num_outputs, + list(last_cnn.shape), + ) + ) + + # num_outputs not known -> Flatten, then set self.num_outputs + # to the resulting number of nodes. + else: + self.last_layer_is_flattened = True + last_layer = tf.keras.layers.Flatten(data_format="channels_last")( + last_layer + ) + + # Add (optional) post-fc-stack after last Conv2D layer. 
+ for i, out_size in enumerate(post_fcnet_hiddens): + last_layer = tf.keras.layers.Dense( + out_size, + name="post_fcnet_{}".format(i), + activation=post_fcnet_activation, + kernel_initializer=normc_initializer(1.0), + )(last_layer) + feature_out = last_layer + self.num_outputs = last_layer.shape[1] + logits_out = last_layer + + # Build the value layers + if vf_share_layers: + if not self.last_layer_is_flattened: + feature_out = tf.keras.layers.Lambda( + lambda x: tf.squeeze(x, axis=[1, 2]) + )(feature_out) + value_out = tf.keras.layers.Dense( + 1, + name="value_out", + activation=None, + kernel_initializer=normc_initializer(0.01), + )(feature_out) + else: + # build a parallel set of hidden layers for the value net + last_layer = inputs + for i, (out_size, kernel, stride) in enumerate(filters[:-1], 1): + last_layer = tf.keras.layers.Conv2D( + out_size, + kernel, + strides=stride + if isinstance(stride, (list, tuple)) + else (stride, stride), + activation=activation, + padding="same", + data_format="channels_last", + name="conv_value_{}".format(i), + )(last_layer) + out_size, kernel, stride = filters[-1] + last_layer = tf.keras.layers.Conv2D( + out_size, + kernel, + strides=stride + if isinstance(stride, (list, tuple)) + else (stride, stride), + activation=activation, + padding="valid", + data_format="channels_last", + name="conv_value_{}".format(len(filters)), + )(last_layer) + last_layer = tf.keras.layers.Conv2D( + 1, + [1, 1], + activation=None, + padding="same", + data_format="channels_last", + name="conv_value_out", + )(last_layer) + value_out = tf.keras.layers.Lambda(lambda x: tf.squeeze(x, axis=[1, 2]))( + last_layer + ) + + self.base_model = tf.keras.Model(inputs, [logits_out, value_out]) + + def forward( + self, + input_dict: Dict[str, TensorType], + state: List[TensorType], + seq_lens: TensorType, + ) -> (TensorType, List[TensorType]): + obs = input_dict["obs"] + if self.data_format == "channels_first": + obs = tf.transpose(obs, [0, 2, 3, 1]) + # Explicit 
cast to float32 needed in eager. + model_out, self._value_out = self.base_model(tf.cast(obs, tf.float32)) + # Our last layer is already flat. + if self.last_layer_is_flattened: + return model_out, state + # Last layer is a n x [1,1] Conv2D -> Flatten. + else: + return tf.squeeze(model_out, axis=[1, 2]), state + + def value_function(self) -> TensorType: + return tf.reshape(self._value_out, [-1]) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/utils.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c57b94bbfd1887dfa33cf19fdfe1e1ca48889d20 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/utils.py @@ -0,0 +1,280 @@ +from typing import Callable, Optional, Union + +from ray.rllib.utils.annotations import DeveloperAPI +from ray.rllib.utils.framework import try_import_jax, try_import_tf, try_import_torch + + +@DeveloperAPI +def get_activation_fn( + name: Optional[Union[Callable, str]] = None, + framework: str = "tf", +): + """Returns a framework specific activation function, given a name string. + + Args: + name: One of "relu" (default), "tanh", "elu", + "swish" (or "silu", which is the same), or "linear" (same as None). + framework: One of "jax", "tf|tf2" or "torch". + + Returns: + A framework-specific activtion function. e.g. tf.nn.tanh or + torch.nn.ReLU. None if name in ["linear", None]. + + Raises: + ValueError: If name is an unknown activation function. + """ + # Already a callable, return as-is. + if callable(name): + return name + + name_lower = name.lower() if isinstance(name, str) else name + + # Infer the correct activation function from the string specifier. + if framework == "torch": + if name_lower in ["linear", None]: + return None + + _, nn = try_import_torch() + # First try getting the correct activation function from nn directly. + # Note that torch activation functions are not all lower case. 
+ fn = getattr(nn, name, None) + if fn is not None: + return fn + + if name_lower in ["swish", "silu"]: + return nn.SiLU + elif name_lower == "relu": + return nn.ReLU + elif name_lower == "tanh": + return nn.Tanh + elif name_lower == "elu": + return nn.ELU + elif framework == "jax": + if name_lower in ["linear", None]: + return None + jax, _ = try_import_jax() + if name_lower in ["swish", "silu"]: + return jax.nn.swish + if name_lower == "relu": + return jax.nn.relu + elif name_lower == "tanh": + return jax.nn.hard_tanh + elif name_lower == "elu": + return jax.nn.elu + else: + assert framework in ["tf", "tf2"], "Unsupported framework `{}`!".format( + framework + ) + if name_lower in ["linear", None]: + return None + + tf1, tf, tfv = try_import_tf() + # Try getting the correct activation function from tf.nn directly. + # Note that tf activation functions are all lower case, so this should always + # work. + fn = getattr(tf.nn, name_lower, None) + + if fn is not None: + return fn + + raise ValueError( + "Unknown activation ({}) for framework={}!".format(name, framework) + ) + + +@DeveloperAPI +def get_initializer_fn(name: Optional[Union[str, Callable]], framework: str = "torch"): + """Returns the framework-specific initializer class or function. + + This function relies fully on the specified initializer classes and + functions in the frameworks `torch` and `tf2` (see for `torch` + https://pytorch.org/docs/stable/nn.init.html and for `tf2` see + https://www.tensorflow.org/api_docs/python/tf/keras/initializers). + + Note, for framework `torch` the in-place initializers are needed, i.e. names + should end with an underscore `_`, e.g. `glorot_uniform_`. + + Args: + name: Name of the initializer class or function in one of the two + supported frameworks, i.e. `torch` or `tf2`. + framework: The framework string, either `torch or `tf2`. 
+ + Returns: + A framework-specific function or class defining an initializer to be used + for network initialization, + + Raises: + `ValueError` if the `name` is neither class or function in the specified + `framework`. Raises also a `ValueError`, if `name` does not define an + in-place initializer for framework `torch`. + """ + # Already a callable or `None` return as is. If `None` we use the default + # initializer defined in the framework-specific layers themselves. + if callable(name) or name is None: + return name + + if framework == "torch": + name_lower = name.lower() if isinstance(name, str) else name + + _, nn = try_import_torch() + + # Check, if the name includes an underscore. We must use the + # in-place initialization from Torch. + if not name_lower.endswith("_"): + raise ValueError( + "Not an in-place initializer: Torch weight initializers " + "need to be provided as their in-place version, i.e. " + " + '_'. See " + "https://pytorch.org/docs/stable/nn.init.html. " + f"User provided {name}." + ) + + # First, try to get the initialization directly from `nn.init`. + # Note, that all initialization methods in `nn.init` are lower + # case and that `_` defines the "in-place" method. + fn = getattr(nn.init, name_lower, None) + if fn is not None: + # TODO (simon): Raise a warning if not "in-place" method. + return fn + # Unknown initializer. + else: + # Inform the user that this initializer does not exist. + raise ValueError( + f"Unknown initializer name: {name_lower} is not a method in " + "`torch.nn.init`!" + ) + elif framework == "tf2": + # Note, as initializer classes in TensorFlow can be either given by their + # name in camel toe typing or by their shortcut we use the `name` as it is. + # See https://www.tensorflow.org/api_docs/python/tf/keras/initializers. + + _, tf, _ = try_import_tf() + + # Try to get the initialization function directly from `tf.keras.initializers`. 
+ fn = getattr(tf.keras.initializers, name, None) + if fn is not None: + return fn + # Unknown initializer. + else: + # Inform the user that this initializer does not exist. + raise ValueError( + f"Unknown initializer: {name} is not a initializer in " + "`tf.keras.initializers`!" + ) + + +@DeveloperAPI +def get_filter_config(shape): + """Returns a default Conv2D filter config (list) for a given image shape. + + Args: + shape (Tuple[int]): The input (image) shape, e.g. (84,84,3). + + Returns: + List[list]: The Conv2D filter configuration usable as `conv_filters` + inside a model config dict. + """ + # 96x96x3 (e.g. CarRacing-v0). + filters_96x96 = [ + [16, [8, 8], 4], + [32, [4, 4], 2], + [256, [11, 11], 2], + ] + # Atari. + filters_84x84 = [ + [16, [8, 8], 4], + [32, [4, 4], 2], + [256, [11, 11], 1], + ] + # Dreamer-style (S-sized model) Atari or DM Control Suite. + filters_64x64 = [ + [32, [4, 4], 2], + [64, [4, 4], 2], + [128, [4, 4], 2], + [256, [4, 4], 2], + ] + # Small (1/2) Atari. + filters_42x42 = [ + [16, [4, 4], 2], + [32, [4, 4], 2], + [256, [11, 11], 1], + ] + # Test image (10x10). + filters_10x10 = [ + [16, [5, 5], 2], + [32, [5, 5], 2], + ] + + shape = list(shape) + if len(shape) in [2, 3] and (shape[:2] == [96, 96] or shape[1:] == [96, 96]): + return filters_96x96 + elif len(shape) in [2, 3] and (shape[:2] == [84, 84] or shape[1:] == [84, 84]): + return filters_84x84 + elif len(shape) in [2, 3] and (shape[:2] == [64, 64] or shape[1:] == [64, 64]): + return filters_64x64 + elif len(shape) in [2, 3] and (shape[:2] == [42, 42] or shape[1:] == [42, 42]): + return filters_42x42 + elif len(shape) in [2, 3] and (shape[:2] == [10, 10] or shape[1:] == [10, 10]): + return filters_10x10 + else: + raise ValueError( + "No default configuration for obs shape {}".format(shape) + + ", you must specify `conv_filters` manually as a model option. 
" + "Default configurations are only available for inputs of the following " + "shapes: [42, 42, K], [84, 84, K], [64, 64, K], [10, 10, K]. You may " + "alternatively want to use a custom model or preprocessor." + ) + + +@DeveloperAPI +def get_initializer(name, framework="tf"): + """Returns a framework specific initializer, given a name string. + + Args: + name: One of "xavier_uniform" (default), "xavier_normal". + framework: One of "jax", "tf|tf2" or "torch". + + Returns: + A framework-specific initializer function, e.g. + tf.keras.initializers.GlorotUniform or + torch.nn.init.xavier_uniform_. + + Raises: + ValueError: If name is an unknown initializer. + """ + # Already a callable, return as-is. + if callable(name): + return name + + if framework == "jax": + _, flax = try_import_jax() + assert flax is not None, "`flax` not installed. Try `pip install jax flax`." + import flax.linen as nn + + if name in [None, "default", "xavier_uniform"]: + return nn.initializers.xavier_uniform() + elif name == "xavier_normal": + return nn.initializers.xavier_normal() + if framework == "torch": + _, nn = try_import_torch() + assert nn is not None, "`torch` not installed. Try `pip install torch`." + if name in [None, "default", "xavier_uniform"]: + return nn.init.xavier_uniform_ + elif name == "xavier_normal": + return nn.init.xavier_normal_ + else: + assert framework in ["tf", "tf2"], "Unsupported framework `{}`!".format( + framework + ) + tf1, tf, tfv = try_import_tf() + assert ( + tf is not None + ), "`tensorflow` not installed. Try `pip install tensorflow`." + if name in [None, "default", "xavier_uniform"]: + return tf.keras.initializers.GlorotUniform + elif name == "xavier_normal": + return tf.keras.initializers.GlorotNormal + + raise ValueError( + "Unknown activation ({}) for framework={}!".format(name, framework) + )