koichi12 commited on
Commit
697a7f6
·
verified ·
1 Parent(s): b1f8d86

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. .venv/lib/python3.11/site-packages/ray/rllib/env/__pycache__/multi_agent_episode.cpython-311.pyc +3 -0
  3. .venv/lib/python3.11/site-packages/ray/rllib/evaluation/__init__.py +20 -0
  4. .venv/lib/python3.11/site-packages/ray/rllib/evaluation/__pycache__/__init__.cpython-311.pyc +0 -0
  5. .venv/lib/python3.11/site-packages/ray/rllib/evaluation/__pycache__/env_runner_v2.cpython-311.pyc +0 -0
  6. .venv/lib/python3.11/site-packages/ray/rllib/evaluation/__pycache__/episode_v2.cpython-311.pyc +0 -0
  7. .venv/lib/python3.11/site-packages/ray/rllib/evaluation/__pycache__/metrics.cpython-311.pyc +0 -0
  8. .venv/lib/python3.11/site-packages/ray/rllib/evaluation/__pycache__/postprocessing.cpython-311.pyc +0 -0
  9. .venv/lib/python3.11/site-packages/ray/rllib/evaluation/__pycache__/rollout_worker.cpython-311.pyc +0 -0
  10. .venv/lib/python3.11/site-packages/ray/rllib/evaluation/__pycache__/sampler.cpython-311.pyc +0 -0
  11. .venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/simple_list_collector.py +698 -0
  12. .venv/lib/python3.11/site-packages/ray/rllib/evaluation/env_runner_v2.py +1232 -0
  13. .venv/lib/python3.11/site-packages/ray/rllib/evaluation/episode_v2.py +378 -0
  14. .venv/lib/python3.11/site-packages/ray/rllib/evaluation/metrics.py +266 -0
  15. .venv/lib/python3.11/site-packages/ray/rllib/evaluation/observation_function.py +87 -0
  16. .venv/lib/python3.11/site-packages/ray/rllib/evaluation/postprocessing.py +328 -0
  17. .venv/lib/python3.11/site-packages/ray/rllib/evaluation/rollout_worker.py +2004 -0
  18. .venv/lib/python3.11/site-packages/ray/rllib/evaluation/sample_batch_builder.py +264 -0
  19. .venv/lib/python3.11/site-packages/ray/rllib/evaluation/sampler.py +253 -0
  20. .venv/lib/python3.11/site-packages/ray/rllib/evaluation/worker_set.py +10 -0
  21. .venv/lib/python3.11/site-packages/ray/rllib/models/torch/__pycache__/__init__.cpython-311.pyc +0 -0
  22. .venv/lib/python3.11/site-packages/ray/rllib/models/torch/__pycache__/attention_net.cpython-311.pyc +0 -0
  23. .venv/lib/python3.11/site-packages/ray/rllib/models/torch/__pycache__/fcnet.cpython-311.pyc +0 -0
  24. .venv/lib/python3.11/site-packages/ray/rllib/models/torch/__pycache__/mingpt.cpython-311.pyc +0 -0
  25. .venv/lib/python3.11/site-packages/ray/rllib/models/torch/__pycache__/recurrent_net.cpython-311.pyc +0 -0
  26. .venv/lib/python3.11/site-packages/ray/rllib/models/torch/__pycache__/torch_action_dist.cpython-311.pyc +0 -0
  27. .venv/lib/python3.11/site-packages/ray/rllib/models/torch/__pycache__/torch_distributions.cpython-311.pyc +0 -0
  28. .venv/lib/python3.11/site-packages/ray/rllib/offline/__init__.py +30 -0
  29. .venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/dataset_reader.cpython-311.pyc +0 -0
  30. .venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/dataset_writer.cpython-311.pyc +0 -0
  31. .venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/feature_importance.cpython-311.pyc +0 -0
  32. .venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/io_context.cpython-311.pyc +0 -0
  33. .venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/is_estimator.cpython-311.pyc +0 -0
  34. .venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/json_reader.cpython-311.pyc +0 -0
  35. .venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/json_writer.cpython-311.pyc +0 -0
  36. .venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/mixed_input.cpython-311.pyc +0 -0
  37. .venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/off_policy_estimator.cpython-311.pyc +0 -0
  38. .venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/offline_data.cpython-311.pyc +0 -0
  39. .venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/offline_env_runner.cpython-311.pyc +0 -0
  40. .venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/offline_evaluation_utils.cpython-311.pyc +0 -0
  41. .venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/offline_evaluator.cpython-311.pyc +0 -0
  42. .venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/offline_prelearner.cpython-311.pyc +0 -0
  43. .venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/output_writer.cpython-311.pyc +0 -0
  44. .venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/wis_estimator.cpython-311.pyc +0 -0
  45. .venv/lib/python3.11/site-packages/ray/rllib/offline/d4rl_reader.py +51 -0
  46. .venv/lib/python3.11/site-packages/ray/rllib/offline/dataset_reader.py +289 -0
  47. .venv/lib/python3.11/site-packages/ray/rllib/offline/dataset_writer.py +82 -0
  48. .venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__pycache__/__init__.cpython-311.pyc +0 -0
  49. .venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__pycache__/direct_method.cpython-311.pyc +0 -0
  50. .venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__pycache__/fqe_torch_model.cpython-311.pyc +0 -0
.gitattributes CHANGED
@@ -178,3 +178,4 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/_
178
  .venv/lib/python3.11/site-packages/ray/_private/thirdparty/pynvml/__pycache__/pynvml.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
179
  .venv/lib/python3.11/site-packages/ray/rllib/algorithms/__pycache__/algorithm.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
180
  .venv/lib/python3.11/site-packages/ray/rllib/algorithms/__pycache__/algorithm_config.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
 
 
178
  .venv/lib/python3.11/site-packages/ray/_private/thirdparty/pynvml/__pycache__/pynvml.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
179
  .venv/lib/python3.11/site-packages/ray/rllib/algorithms/__pycache__/algorithm.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
180
  .venv/lib/python3.11/site-packages/ray/rllib/algorithms/__pycache__/algorithm_config.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
181
+ .venv/lib/python3.11/site-packages/ray/rllib/env/__pycache__/multi_agent_episode.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
.venv/lib/python3.11/site-packages/ray/rllib/env/__pycache__/multi_agent_episode.cpython-311.pyc ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70ee04d5ba78d502ad5d58d83cd6ec52ed3635c4af63ccc12837f71debf75e54
3
+ size 115849
.venv/lib/python3.11/site-packages/ray/rllib/evaluation/__init__.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ray.rllib.evaluation.rollout_worker import RolloutWorker
2
+ from ray.rllib.evaluation.sample_batch_builder import (
3
+ SampleBatchBuilder,
4
+ MultiAgentSampleBatchBuilder,
5
+ )
6
+ from ray.rllib.evaluation.sampler import SyncSampler
7
+ from ray.rllib.evaluation.postprocessing import compute_advantages
8
+ from ray.rllib.evaluation.metrics import collect_metrics
9
+ from ray.rllib.policy.sample_batch import SampleBatch, MultiAgentBatch
10
+
11
+ __all__ = [
12
+ "RolloutWorker",
13
+ "SampleBatch",
14
+ "MultiAgentBatch",
15
+ "SampleBatchBuilder",
16
+ "MultiAgentSampleBatchBuilder",
17
+ "SyncSampler",
18
+ "compute_advantages",
19
+ "collect_metrics",
20
+ ]
.venv/lib/python3.11/site-packages/ray/rllib/evaluation/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (888 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/evaluation/__pycache__/env_runner_v2.cpython-311.pyc ADDED
Binary file (44.6 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/evaluation/__pycache__/episode_v2.cpython-311.pyc ADDED
Binary file (15.5 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/evaluation/__pycache__/metrics.cpython-311.pyc ADDED
Binary file (11.2 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/evaluation/__pycache__/postprocessing.cpython-311.pyc ADDED
Binary file (13.9 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/evaluation/__pycache__/rollout_worker.cpython-311.pyc ADDED
Binary file (85.5 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/evaluation/__pycache__/sampler.cpython-311.pyc ADDED
Binary file (12.3 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/simple_list_collector.py ADDED
@@ -0,0 +1,698 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import collections
2
+ from gymnasium.spaces import Space
3
+ import logging
4
+ import numpy as np
5
+ import tree # pip install dm_tree
6
+ from typing import Dict, List, Optional, Tuple, TYPE_CHECKING, Union
7
+
8
+ from ray.rllib.env.base_env import _DUMMY_AGENT_ID
9
+ from ray.rllib.evaluation.collectors.sample_collector import SampleCollector
10
+ from ray.rllib.evaluation.collectors.agent_collector import AgentCollector
11
+ from ray.rllib.policy.policy import Policy
12
+ from ray.rllib.policy.policy_map import PolicyMap
13
+ from ray.rllib.policy.sample_batch import SampleBatch, MultiAgentBatch, concat_samples
14
+ from ray.rllib.utils.annotations import OldAPIStack, override
15
+ from ray.rllib.utils.debug import summarize
16
+ from ray.rllib.utils.framework import try_import_tf, try_import_torch
17
+ from ray.rllib.utils.spaces.space_utils import get_dummy_batch_for_space
18
+ from ray.rllib.utils.typing import (
19
+ AgentID,
20
+ EpisodeID,
21
+ EnvID,
22
+ PolicyID,
23
+ TensorType,
24
+ ViewRequirementsDict,
25
+ )
26
+ from ray.util.debug import log_once
27
+
28
+ _, tf, _ = try_import_tf()
29
+ torch, _ = try_import_torch()
30
+
31
+ if TYPE_CHECKING:
32
+ from ray.rllib.callbacks.callbacks import RLlibCallback
33
+
34
+ logger = logging.getLogger(__name__)
35
+
36
+
37
+ @OldAPIStack
38
+ class _PolicyCollector:
39
+ """Collects already postprocessed (single agent) samples for one policy.
40
+
41
+ Samples come in through already postprocessed SampleBatches, which
42
+ contain single episode/trajectory data for a single agent and are then
43
+ appended to this policy's buffers.
44
+ """
45
+
46
+ def __init__(self, policy: Policy):
47
+ """Initializes a _PolicyCollector instance.
48
+
49
+ Args:
50
+ policy: The policy object.
51
+ """
52
+
53
+ self.batches = []
54
+ self.policy = policy
55
+ # The total timestep count for all agents that use this policy.
56
+ # NOTE: This is not an env-step count (across n agents). AgentA and
57
+ # agentB, both using this policy, acting in the same episode and both
58
+ # doing n steps would increase the count by 2*n.
59
+ self.agent_steps = 0
60
+
61
+ def add_postprocessed_batch_for_training(
62
+ self, batch: SampleBatch, view_requirements: ViewRequirementsDict
63
+ ) -> None:
64
+ """Adds a postprocessed SampleBatch (single agent) to our buffers.
65
+
66
+ Args:
67
+ batch: An individual agent's (one trajectory)
68
+ SampleBatch to be added to the Policy's buffers.
69
+ view_requirements: The view
70
+ requirements for the policy. This is so we know, whether a
71
+ view-column needs to be copied at all (not needed for
72
+ training).
73
+ """
74
+ # Add the agent's trajectory length to our count.
75
+ self.agent_steps += batch.count
76
+ # And remove columns not needed for training.
77
+ for view_col, view_req in view_requirements.items():
78
+ if view_col in batch and not view_req.used_for_training:
79
+ del batch[view_col]
80
+ self.batches.append(batch)
81
+
82
+ def build(self):
83
+ """Builds a SampleBatch for this policy from the collected data.
84
+
85
+ Also resets all buffers for further sample collection for this policy.
86
+
87
+ Returns:
88
+ SampleBatch: The SampleBatch with all thus-far collected data for
89
+ this policy.
90
+ """
91
+ # Create batch from our buffers.
92
+ batch = concat_samples(self.batches)
93
+ # Clear batches for future samples.
94
+ self.batches = []
95
+ # Reset agent steps to 0.
96
+ self.agent_steps = 0
97
+ # Add num_grad_updates counter to the policy's batch.
98
+ batch.num_grad_updates = self.policy.num_grad_updates
99
+
100
+ return batch
101
+
102
+
103
+ class _PolicyCollectorGroup:
104
+ def __init__(self, policy_map):
105
+ self.policy_collectors = {}
106
+ # Total env-steps (1 env-step=up to N agents stepped).
107
+ self.env_steps = 0
108
+ # Total agent steps (1 agent-step=1 individual agent (out of N)
109
+ # stepped).
110
+ self.agent_steps = 0
111
+
112
+
113
+ @OldAPIStack
114
+ class SimpleListCollector(SampleCollector):
115
+ """Util to build SampleBatches for each policy in a multi-agent env.
116
+
117
+ Input data is per-agent, while output data is per-policy. There is an M:N
118
+ mapping between agents and policies. We retain one local batch builder
119
+ per agent. When an agent is done, then its local batch is appended into the
120
+ corresponding policy batch for the agent's policy.
121
+ """
122
+
123
+ def __init__(
124
+ self,
125
+ policy_map: PolicyMap,
126
+ clip_rewards: Union[bool, float],
127
+ callbacks: "RLlibCallback",
128
+ multiple_episodes_in_batch: bool = True,
129
+ rollout_fragment_length: int = 200,
130
+ count_steps_by: str = "env_steps",
131
+ ):
132
+ """Initializes a SimpleListCollector instance."""
133
+
134
+ super().__init__(
135
+ policy_map,
136
+ clip_rewards,
137
+ callbacks,
138
+ multiple_episodes_in_batch,
139
+ rollout_fragment_length,
140
+ count_steps_by,
141
+ )
142
+
143
+ self.large_batch_threshold: int = (
144
+ max(1000, self.rollout_fragment_length * 10)
145
+ if self.rollout_fragment_length != float("inf")
146
+ else 5000
147
+ )
148
+
149
+ # Whenever we observe a new episode+agent, add a new
150
+ # _SingleTrajectoryCollector.
151
+ self.agent_collectors: Dict[Tuple[EpisodeID, AgentID], AgentCollector] = {}
152
+ # Internal agent-key-to-policy-id map.
153
+ self.agent_key_to_policy_id = {}
154
+ # Pool of used/unused PolicyCollectorGroups (attached to episodes for
155
+ # across-episode multi-agent sample collection).
156
+ self.policy_collector_groups = []
157
+
158
+ # Agents to collect data from for the next forward pass (per policy).
159
+ self.forward_pass_agent_keys = {pid: [] for pid in self.policy_map.keys()}
160
+ self.forward_pass_size = {pid: 0 for pid in self.policy_map.keys()}
161
+
162
+ # Maps episode ID to the (non-built) env steps taken in this episode.
163
+ self.episode_steps: Dict[EpisodeID, int] = collections.defaultdict(int)
164
+ # Maps episode ID to the (non-built) individual agent steps in this
165
+ # episode.
166
+ self.agent_steps: Dict[EpisodeID, int] = collections.defaultdict(int)
167
+ # Maps episode ID to Episode.
168
+ self.episodes = {}
169
+
170
+ @override(SampleCollector)
171
+ def episode_step(self, episode) -> None:
172
+ episode_id = episode.episode_id
173
+ # In the rase case that an "empty" step is taken at the beginning of
174
+ # the episode (none of the agents has an observation in the obs-dict
175
+ # and thus does not take an action), we have seen the episode before
176
+ # and have to add it here to our registry.
177
+ if episode_id not in self.episodes:
178
+ self.episodes[episode_id] = episode
179
+ else:
180
+ assert episode is self.episodes[episode_id]
181
+ self.episode_steps[episode_id] += 1
182
+ episode.length += 1
183
+
184
+ # In case of "empty" env steps (no agent is stepping), the builder
185
+ # object may still be None.
186
+ if episode.batch_builder:
187
+ env_steps = episode.batch_builder.env_steps
188
+ num_individual_observations = sum(
189
+ c.agent_steps for c in episode.batch_builder.policy_collectors.values()
190
+ )
191
+
192
+ if num_individual_observations > self.large_batch_threshold and log_once(
193
+ "large_batch_warning"
194
+ ):
195
+ logger.warning(
196
+ "More than {} observations in {} env steps for "
197
+ "episode {} ".format(
198
+ num_individual_observations, env_steps, episode_id
199
+ )
200
+ + "are buffered in the sampler. If this is more than you "
201
+ "expected, check that that you set a horizon on your "
202
+ "environment correctly and that it terminates at some "
203
+ "point. Note: In multi-agent environments, "
204
+ "`rollout_fragment_length` sets the batch size based on "
205
+ "(across-agents) environment steps, not the steps of "
206
+ "individual agents, which can result in unexpectedly "
207
+ "large batches."
208
+ + (
209
+ "Also, you may be waiting for your Env to "
210
+ "terminate (batch_mode=`complete_episodes`). Make sure "
211
+ "it does at some point."
212
+ if not self.multiple_episodes_in_batch
213
+ else ""
214
+ )
215
+ )
216
+
217
+ @override(SampleCollector)
218
+ def add_init_obs(
219
+ self,
220
+ *,
221
+ episode,
222
+ agent_id: AgentID,
223
+ env_id: EnvID,
224
+ policy_id: PolicyID,
225
+ init_obs: TensorType,
226
+ init_infos: Optional[Dict[str, TensorType]] = None,
227
+ t: int = -1,
228
+ ) -> None:
229
+ # Make sure our mappings are up to date.
230
+ agent_key = (episode.episode_id, agent_id)
231
+ self.agent_key_to_policy_id[agent_key] = policy_id
232
+ policy = self.policy_map[policy_id]
233
+
234
+ # Add initial obs to Trajectory.
235
+ assert agent_key not in self.agent_collectors
236
+ # TODO: determine exact shift-before based on the view-req shifts.
237
+
238
+ # get max_seq_len value (Default is 1)
239
+ try:
240
+ max_seq_len = policy.config["model"]["max_seq_len"]
241
+ except KeyError:
242
+ max_seq_len = 1
243
+
244
+ self.agent_collectors[agent_key] = AgentCollector(
245
+ policy.view_requirements,
246
+ max_seq_len=max_seq_len,
247
+ disable_action_flattening=policy.config.get(
248
+ "_disable_action_flattening", False
249
+ ),
250
+ intial_states=policy.get_initial_state(),
251
+ is_policy_recurrent=policy.is_recurrent(),
252
+ )
253
+ self.agent_collectors[agent_key].add_init_obs(
254
+ episode_id=episode.episode_id,
255
+ agent_index=episode._agent_index(agent_id),
256
+ env_id=env_id,
257
+ init_obs=init_obs,
258
+ init_infos=init_infos or {},
259
+ t=t,
260
+ )
261
+
262
+ self.episodes[episode.episode_id] = episode
263
+ if episode.batch_builder is None:
264
+ episode.batch_builder = (
265
+ self.policy_collector_groups.pop()
266
+ if self.policy_collector_groups
267
+ else _PolicyCollectorGroup(self.policy_map)
268
+ )
269
+
270
+ self._add_to_next_inference_call(agent_key)
271
+
272
+ @override(SampleCollector)
273
+ def add_action_reward_next_obs(
274
+ self,
275
+ episode_id: EpisodeID,
276
+ agent_id: AgentID,
277
+ env_id: EnvID,
278
+ policy_id: PolicyID,
279
+ agent_done: bool,
280
+ values: Dict[str, TensorType],
281
+ ) -> None:
282
+ # Make sure, episode/agent already has some (at least init) data.
283
+ agent_key = (episode_id, agent_id)
284
+ assert self.agent_key_to_policy_id[agent_key] == policy_id
285
+ assert agent_key in self.agent_collectors
286
+
287
+ self.agent_steps[episode_id] += 1
288
+
289
+ # Include the current agent id for multi-agent algorithms.
290
+ if agent_id != _DUMMY_AGENT_ID:
291
+ values["agent_id"] = agent_id
292
+
293
+ # Add action/reward/next-obs (and other data) to Trajectory.
294
+ self.agent_collectors[agent_key].add_action_reward_next_obs(values)
295
+
296
+ if not agent_done:
297
+ self._add_to_next_inference_call(agent_key)
298
+
299
+ @override(SampleCollector)
300
+ def total_env_steps(self) -> int:
301
+ # Add the non-built ongoing-episode env steps + the already built
302
+ # env-steps.
303
+ return sum(self.episode_steps.values()) + sum(
304
+ pg.env_steps for pg in self.policy_collector_groups.values()
305
+ )
306
+
307
+ @override(SampleCollector)
308
+ def total_agent_steps(self) -> int:
309
+ # Add the non-built ongoing-episode agent steps (still in the agent
310
+ # collectors) + the already built agent steps.
311
+ return sum(a.agent_steps for a in self.agent_collectors.values()) + sum(
312
+ pg.agent_steps for pg in self.policy_collector_groups.values()
313
+ )
314
+
315
+ @override(SampleCollector)
316
+ def get_inference_input_dict(self, policy_id: PolicyID) -> Dict[str, TensorType]:
317
+ policy = self.policy_map[policy_id]
318
+ keys = self.forward_pass_agent_keys[policy_id]
319
+ batch_size = len(keys)
320
+
321
+ # Return empty batch, if no forward pass to do.
322
+ if batch_size == 0:
323
+ return SampleBatch()
324
+
325
+ buffers = {}
326
+ for k in keys:
327
+ collector = self.agent_collectors[k]
328
+ buffers[k] = collector.buffers
329
+ # Use one agent's buffer_structs (they should all be the same).
330
+ buffer_structs = self.agent_collectors[keys[0]].buffer_structs
331
+
332
+ input_dict = {}
333
+ for view_col, view_req in policy.view_requirements.items():
334
+ # Not used for action computations.
335
+ if not view_req.used_for_compute_actions:
336
+ continue
337
+
338
+ # Create the batch of data from the different buffers.
339
+ data_col = view_req.data_col or view_col
340
+ delta = (
341
+ -1
342
+ if data_col
343
+ in [
344
+ SampleBatch.OBS,
345
+ SampleBatch.INFOS,
346
+ SampleBatch.ENV_ID,
347
+ SampleBatch.EPS_ID,
348
+ SampleBatch.AGENT_INDEX,
349
+ SampleBatch.T,
350
+ ]
351
+ else 0
352
+ )
353
+ # Range of shifts, e.g. "-100:0". Note: This includes index 0!
354
+ if view_req.shift_from is not None:
355
+ time_indices = (view_req.shift_from + delta, view_req.shift_to + delta)
356
+ # Single shift (e.g. -1) or list of shifts, e.g. [-4, -1, 0].
357
+ else:
358
+ time_indices = view_req.shift + delta
359
+
360
+ # Loop through agents and add up their data (batch).
361
+ data = None
362
+ for k in keys:
363
+ # Buffer for the data does not exist yet: Create dummy
364
+ # (zero) data.
365
+ if data_col not in buffers[k]:
366
+ if view_req.data_col is not None:
367
+ space = policy.view_requirements[view_req.data_col].space
368
+ else:
369
+ space = view_req.space
370
+
371
+ if isinstance(space, Space):
372
+ fill_value = get_dummy_batch_for_space(
373
+ space,
374
+ batch_size=0,
375
+ )
376
+ else:
377
+ fill_value = space
378
+
379
+ self.agent_collectors[k]._build_buffers({data_col: fill_value})
380
+
381
+ if data is None:
382
+ data = [[] for _ in range(len(buffers[keys[0]][data_col]))]
383
+
384
+ # `shift_from` and `shift_to` are defined: User wants a
385
+ # view with some time-range.
386
+ if isinstance(time_indices, tuple):
387
+ # `shift_to` == -1: Until the end (including(!) the
388
+ # last item).
389
+ if time_indices[1] == -1:
390
+ for d, b in zip(data, buffers[k][data_col]):
391
+ d.append(b[time_indices[0] :])
392
+ # `shift_to` != -1: "Normal" range.
393
+ else:
394
+ for d, b in zip(data, buffers[k][data_col]):
395
+ d.append(b[time_indices[0] : time_indices[1] + 1])
396
+ # Single index.
397
+ else:
398
+ for d, b in zip(data, buffers[k][data_col]):
399
+ d.append(b[time_indices])
400
+
401
+ np_data = [np.array(d) for d in data]
402
+ if data_col in buffer_structs:
403
+ input_dict[view_col] = tree.unflatten_as(
404
+ buffer_structs[data_col], np_data
405
+ )
406
+ else:
407
+ input_dict[view_col] = np_data[0]
408
+
409
+ self._reset_inference_calls(policy_id)
410
+
411
+ return SampleBatch(
412
+ input_dict,
413
+ seq_lens=np.ones(batch_size, dtype=np.int32)
414
+ if "state_in_0" in input_dict
415
+ else None,
416
+ )
417
+
418
+ @override(SampleCollector)
419
+ def postprocess_episode(
420
+ self,
421
+ episode,
422
+ is_done: bool = False,
423
+ check_dones: bool = False,
424
+ build: bool = False,
425
+ ) -> Union[None, SampleBatch, MultiAgentBatch]:
426
+ episode_id = episode.episode_id
427
+ policy_collector_group = episode.batch_builder
428
+
429
+ # Build SampleBatches for the given episode.
430
+ pre_batches = {}
431
+ for (eps_id, agent_id), collector in self.agent_collectors.items():
432
+ # Build only if there is data and agent is part of given episode.
433
+ if collector.agent_steps == 0 or eps_id != episode_id:
434
+ continue
435
+ pid = self.agent_key_to_policy_id[(eps_id, agent_id)]
436
+ policy = self.policy_map[pid]
437
+ pre_batch = collector.build_for_training(policy.view_requirements)
438
+ pre_batches[agent_id] = (policy, pre_batch)
439
+
440
+ # Apply reward clipping before calling postprocessing functions.
441
+ if self.clip_rewards is True:
442
+ for _, (_, pre_batch) in pre_batches.items():
443
+ pre_batch["rewards"] = np.sign(pre_batch["rewards"])
444
+ elif self.clip_rewards:
445
+ for _, (_, pre_batch) in pre_batches.items():
446
+ pre_batch["rewards"] = np.clip(
447
+ pre_batch["rewards"],
448
+ a_min=-self.clip_rewards,
449
+ a_max=self.clip_rewards,
450
+ )
451
+
452
+ post_batches = {}
453
+ for agent_id, (_, pre_batch) in pre_batches.items():
454
+ # Entire episode is said to be done.
455
+ # Error if no DONE at end of this agent's trajectory.
456
+ if is_done and check_dones and not pre_batch.is_terminated_or_truncated():
457
+ raise ValueError(
458
+ "Episode {} terminated for all agents, but we still "
459
+ "don't have a last observation for agent {} (policy "
460
+ "{}). ".format(
461
+ episode_id,
462
+ agent_id,
463
+ self.agent_key_to_policy_id[(episode_id, agent_id)],
464
+ )
465
+ + "Please ensure that you include the last observations "
466
+ "of all live agents when setting truncated[__all__] or "
467
+ "terminated[__all__] to True."
468
+ )
469
+
470
+ # Skip a trajectory's postprocessing (and thus using it for training),
471
+ # if its agent's info exists and contains the training_enabled=False
472
+ # setting (used by our PolicyClients).
473
+ last_info = episode.last_info_for(agent_id)
474
+ if last_info and not last_info.get("training_enabled", True):
475
+ if is_done:
476
+ agent_key = (episode_id, agent_id)
477
+ del self.agent_key_to_policy_id[agent_key]
478
+ del self.agent_collectors[agent_key]
479
+ continue
480
+
481
+ if len(pre_batches) > 1:
482
+ other_batches = pre_batches.copy()
483
+ del other_batches[agent_id]
484
+ else:
485
+ other_batches = {}
486
+ pid = self.agent_key_to_policy_id[(episode_id, agent_id)]
487
+ policy = self.policy_map[pid]
488
+ if not pre_batch.is_single_trajectory():
489
+ raise ValueError(
490
+ "Batches sent to postprocessing must be from a single trajectory! "
491
+ "TERMINATED & TRUNCATED need to be False everywhere, except the "
492
+ "last timestep, which can be either True or False for those keys)!",
493
+ pre_batch,
494
+ )
495
+ elif len(set(pre_batch[SampleBatch.EPS_ID])) > 1:
496
+ episode_ids = set(pre_batch[SampleBatch.EPS_ID])
497
+ raise ValueError(
498
+ "Batches sent to postprocessing must only contain steps "
499
+ "from a single episode! Your trajectory contains data from "
500
+ f"{len(episode_ids)} episodes ({list(episode_ids)}).",
501
+ pre_batch,
502
+ )
503
+ # Call the Policy's Exploration's postprocess method.
504
+ post_batches[agent_id] = pre_batch
505
+ if getattr(policy, "exploration", None) is not None:
506
+ policy.exploration.postprocess_trajectory(
507
+ policy, post_batches[agent_id], policy.get_session()
508
+ )
509
+ post_batches[agent_id].set_get_interceptor(None)
510
+ post_batches[agent_id] = policy.postprocess_trajectory(
511
+ post_batches[agent_id], other_batches, episode
512
+ )
513
+
514
+ if log_once("after_post"):
515
+ logger.info(
516
+ "Trajectory fragment after postprocess_trajectory():\n\n{}\n".format(
517
+ summarize(post_batches)
518
+ )
519
+ )
520
+
521
+ # Append into policy batches and reset.
522
+ from ray.rllib.evaluation.rollout_worker import get_global_worker
523
+
524
+ for agent_id, post_batch in sorted(post_batches.items()):
525
+ agent_key = (episode_id, agent_id)
526
+ pid = self.agent_key_to_policy_id[agent_key]
527
+ policy = self.policy_map[pid]
528
+ self.callbacks.on_postprocess_trajectory(
529
+ worker=get_global_worker(),
530
+ episode=episode,
531
+ agent_id=agent_id,
532
+ policy_id=pid,
533
+ policies=self.policy_map,
534
+ postprocessed_batch=post_batch,
535
+ original_batches=pre_batches,
536
+ )
537
+
538
+ # Add the postprocessed SampleBatch to the policy collectors for
539
+ # training.
540
+ # PID may be a newly added policy. Just confirm we have it in our
541
+ # policy map before proceeding with adding a new _PolicyCollector()
542
+ # to the group.
543
+ if pid not in policy_collector_group.policy_collectors:
544
+ assert pid in self.policy_map
545
+ policy_collector_group.policy_collectors[pid] = _PolicyCollector(policy)
546
+ policy_collector_group.policy_collectors[
547
+ pid
548
+ ].add_postprocessed_batch_for_training(post_batch, policy.view_requirements)
549
+
550
+ if is_done:
551
+ del self.agent_key_to_policy_id[agent_key]
552
+ del self.agent_collectors[agent_key]
553
+
554
+ if policy_collector_group:
555
+ env_steps = self.episode_steps[episode_id]
556
+ policy_collector_group.env_steps += env_steps
557
+ agent_steps = self.agent_steps[episode_id]
558
+ policy_collector_group.agent_steps += agent_steps
559
+
560
+ if is_done:
561
+ del self.episode_steps[episode_id]
562
+ del self.episodes[episode_id]
563
+
564
+ if episode_id in self.agent_steps:
565
+ del self.agent_steps[episode_id]
566
+ else:
567
+ assert (
568
+ len(pre_batches) == 0
569
+ ), "Expected the batch to be empty since the episode_id is missing."
570
+ # if the key does not exist it means that throughout the episode all
571
+ # observations were empty (i.e. there was no agent in the env)
572
+ msg = (
573
+ f"Data from episode {episode_id} does not show any agent "
574
+ f"interactions. Hint: Make sure for at least one timestep in the "
575
+ f"episode, env.step() returns non-empty values."
576
+ )
577
+ raise ValueError(msg)
578
+
579
+ # Make PolicyCollectorGroup available for more agent batches in
580
+ # other episodes. Do not reset count to 0.
581
+ if policy_collector_group:
582
+ self.policy_collector_groups.append(policy_collector_group)
583
+ else:
584
+ self.episode_steps[episode_id] = self.agent_steps[episode_id] = 0
585
+
586
+ # Build a MultiAgentBatch from the episode and return.
587
+ if build:
588
+ return self._build_multi_agent_batch(episode)
589
+
590
+ def _build_multi_agent_batch(self, episode) -> Union[MultiAgentBatch, SampleBatch]:
591
+
592
+ ma_batch = {}
593
+ for pid, collector in episode.batch_builder.policy_collectors.items():
594
+ if collector.agent_steps > 0:
595
+ ma_batch[pid] = collector.build()
596
+
597
+ # TODO(sven): We should always return the same type here (MultiAgentBatch),
598
+ # no matter what. Just have to unify our `training_step` methods, then. This
599
+ # will reduce a lot of confusion about what comes out of the sampling process.
600
+ # Create the batch.
601
+ ma_batch = MultiAgentBatch.wrap_as_needed(
602
+ ma_batch, env_steps=episode.batch_builder.env_steps
603
+ )
604
+
605
+ # PolicyCollectorGroup is empty.
606
+ episode.batch_builder.env_steps = 0
607
+ episode.batch_builder.agent_steps = 0
608
+
609
+ return ma_batch
610
+
611
+ @override(SampleCollector)
612
+ def try_build_truncated_episode_multi_agent_batch(
613
+ self,
614
+ ) -> List[Union[MultiAgentBatch, SampleBatch]]:
615
+ batches = []
616
+ # Loop through ongoing episodes and see whether their length plus
617
+ # what's already in the policy collectors reaches the fragment-len
618
+ # (abiding to the unit used: env-steps or agent-steps).
619
+ for episode_id, episode in self.episodes.items():
620
+ # Measure batch size in env-steps.
621
+ if self.count_steps_by == "env_steps":
622
+ built_steps = (
623
+ episode.batch_builder.env_steps if episode.batch_builder else 0
624
+ )
625
+ ongoing_steps = self.episode_steps[episode_id]
626
+ # Measure batch-size in agent-steps.
627
+ else:
628
+ built_steps = (
629
+ episode.batch_builder.agent_steps if episode.batch_builder else 0
630
+ )
631
+ ongoing_steps = self.agent_steps[episode_id]
632
+
633
+ # Reached the fragment-len -> We should build an MA-Batch.
634
+ if built_steps + ongoing_steps >= self.rollout_fragment_length:
635
+ if self.count_steps_by == "env_steps":
636
+ assert built_steps + ongoing_steps == self.rollout_fragment_length
637
+ # If we reached the fragment-len only because of `episode_id`
638
+ # (still ongoing) -> postprocess `episode_id` first.
639
+ if built_steps < self.rollout_fragment_length:
640
+ self.postprocess_episode(episode, is_done=False)
641
+ # If there is a builder for this episode,
642
+ # build the MA-batch and add to return values.
643
+ if episode.batch_builder:
644
+ batch = self._build_multi_agent_batch(episode=episode)
645
+ batches.append(batch)
646
+ # No batch-builder:
647
+ # We have reached the rollout-fragment length w/o any agent
648
+ # steps! Warn that the environment may never request any
649
+ # actions from any agents.
650
+ elif log_once("no_agent_steps"):
651
+ logger.warning(
652
+ "Your environment seems to be stepping w/o ever "
653
+ "emitting agent observations (agents are never "
654
+ "requested to act)!"
655
+ )
656
+
657
+ return batches
658
+
659
+ def _add_to_next_inference_call(self, agent_key: Tuple[EpisodeID, AgentID]) -> None:
660
+ """Adds an Agent key (episode+agent IDs) to the next inference call.
661
+
662
+ This makes sure that the agent's current data (in the trajectory) is
663
+ used for generating the next input_dict for a
664
+ `Policy.compute_actions()` call.
665
+
666
+ Args:
667
+ agent_key (Tuple[EpisodeID, AgentID]: A unique agent key (across
668
+ vectorized environments).
669
+ """
670
+ pid = self.agent_key_to_policy_id[agent_key]
671
+
672
+ # PID may be a newly added policy (added on the fly during training).
673
+ # Just confirm we have it in our policy map before proceeding with
674
+ # forward_pass_size=0.
675
+ if pid not in self.forward_pass_size:
676
+ assert pid in self.policy_map
677
+ self.forward_pass_size[pid] = 0
678
+ self.forward_pass_agent_keys[pid] = []
679
+
680
+ idx = self.forward_pass_size[pid]
681
+ assert idx >= 0
682
+ if idx == 0:
683
+ self.forward_pass_agent_keys[pid].clear()
684
+
685
+ self.forward_pass_agent_keys[pid].append(agent_key)
686
+ self.forward_pass_size[pid] += 1
687
+
688
+ def _reset_inference_calls(self, policy_id: PolicyID) -> None:
689
+ """Resets internal inference input-dict registries.
690
+
691
+ Calling `self.get_inference_input_dict()` after this method is called
692
+ would return an empty input-dict.
693
+
694
+ Args:
695
+ policy_id: The policy ID for which to reset the
696
+ inference pointers.
697
+ """
698
+ self.forward_pass_size[policy_id] = 0
.venv/lib/python3.11/site-packages/ray/rllib/evaluation/env_runner_v2.py ADDED
@@ -0,0 +1,1232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import defaultdict
2
+ import logging
3
+ import time
4
+ import tree # pip install dm_tree
5
+ from typing import TYPE_CHECKING, Dict, Iterator, List, Optional, Set, Tuple, Union
6
+ import numpy as np
7
+
8
+ from ray.rllib.env.base_env import ASYNC_RESET_RETURN, BaseEnv
9
+ from ray.rllib.env.external_env import ExternalEnvWrapper
10
+ from ray.rllib.env.wrappers.atari_wrappers import MonitorEnv, get_wrapper_by_cls
11
+ from ray.rllib.evaluation.collectors.simple_list_collector import _PolicyCollectorGroup
12
+ from ray.rllib.evaluation.episode_v2 import EpisodeV2
13
+ from ray.rllib.evaluation.metrics import RolloutMetrics
14
+ from ray.rllib.models.preprocessors import Preprocessor
15
+ from ray.rllib.policy.policy import Policy
16
+ from ray.rllib.policy.sample_batch import MultiAgentBatch, SampleBatch, concat_samples
17
+ from ray.rllib.utils.annotations import OldAPIStack
18
+ from ray.rllib.utils.filter import Filter
19
+ from ray.rllib.utils.numpy import convert_to_numpy
20
+ from ray.rllib.utils.spaces.space_utils import unbatch, get_original_space
21
+ from ray.rllib.utils.typing import (
22
+ ActionConnectorDataType,
23
+ AgentConnectorDataType,
24
+ AgentID,
25
+ EnvActionType,
26
+ EnvID,
27
+ EnvInfoDict,
28
+ EnvObsType,
29
+ MultiAgentDict,
30
+ MultiEnvDict,
31
+ PolicyID,
32
+ PolicyOutputType,
33
+ SampleBatchType,
34
+ StateBatches,
35
+ TensorStructType,
36
+ )
37
+ from ray.util.debug import log_once
38
+
39
+ if TYPE_CHECKING:
40
+ from gymnasium.envs.classic_control.rendering import SimpleImageViewer
41
+
42
+ from ray.rllib.callbacks.callbacks import RLlibCallback
43
+ from ray.rllib.evaluation.rollout_worker import RolloutWorker
44
+
45
+
46
+ logger = logging.getLogger(__name__)
47
+
48
+
49
+ MIN_LARGE_BATCH_THRESHOLD = 1000
50
+ DEFAULT_LARGE_BATCH_THRESHOLD = 5000
51
+ MS_TO_SEC = 1000.0
52
+
53
+
54
@OldAPIStack
class _PerfStats:
    """Sampler perf stats that will be included in rollout metrics.

    Two accumulation modes:
      * global average (``ema_coef is None``): raw running sums, divided by
        the iteration count in `get()`.
      * exponential moving average: each update does
        ``updated = (1 - ema_coef) * old + ema_coef * new``.
    """

    def __init__(self, ema_coef: Optional[float] = None):
        # If not None, enable Exponential Moving Average mode, which in
        # general gives more responsive stats about sampler performance.
        # TODO(jungong) : make ema the default (only) mode if it works well.
        self.ema_coef = ema_coef

        self.iters = 0
        # All timing counters are kept in seconds.
        self.raw_obs_processing_time = 0.0
        self.inference_time = 0.0
        self.action_processing_time = 0.0
        self.env_wait_time = 0.0
        self.env_render_time = 0.0

    def incr(self, field: str, value: Union[int, float]):
        """Accumulate `value` into counter `field` (mode-dependent)."""
        # Iteration count is a plain sum regardless of mode.
        if field == "iters":
            self.iters += value
            return

        # All the other fields support either global average or ema mode.
        if self.ema_coef is None:
            # Global average: running sum.
            self.__dict__[field] += value
        else:
            # EMA update.
            self.__dict__[field] = (1.0 - self.ema_coef) * self.__dict__[
                field
            ] + self.ema_coef * value

    def _metric_dict(self, factor: float):
        """Map the raw second-counters to their millisecond metric names."""
        return {
            # Raw observation preprocessing.
            "mean_raw_obs_processing_ms": self.raw_obs_processing_time * factor,
            # Computing actions through policy.
            "mean_inference_ms": self.inference_time * factor,
            # Processing actions (to be sent to env, e.g. clipping).
            "mean_action_processing_ms": self.action_processing_time * factor,
            # Waiting for environment (during poll).
            "mean_env_wait_ms": self.env_wait_time * factor,
            # Environment rendering (False by default).
            "mean_env_render_ms": self.env_render_time * factor,
        }

    def _get_avg(self):
        # Global-average mode: divide the sums by the iteration count and
        # convert sec -> ms. (Note: raises ZeroDivisionError if called before
        # any "iters" increment — same as the historical behavior.)
        return self._metric_dict(MS_TO_SEC / self.iters)

    def _get_ema(self):
        # In EMA mode, stats are already (exponentially) averaged,
        # hence we only need to do the sec -> ms conversion here.
        return self._metric_dict(MS_TO_SEC)

    def get(self):
        """Return the current perf metrics dict (values in milliseconds)."""
        return self._get_avg() if self.ema_coef is None else self._get_ema()
124
+
125
+
126
@OldAPIStack
class _NewDefaultDict(defaultdict):
    """defaultdict variant whose factory receives the missing key.

    Unlike a plain defaultdict (whose default_factory takes no arguments),
    the missing env_id is forwarded to the factory.
    """

    def __missing__(self, env_id):
        # Build the default value for this env_id, cache it, and return it.
        value = self.default_factory(env_id)
        self[env_id] = value
        return value
131
+
132
+
133
@OldAPIStack
def _build_multi_agent_batch(
    episode_id: int,
    batch_builder: _PolicyCollectorGroup,
    large_batch_threshold: int,
    multiple_episodes_in_batch: bool,
) -> MultiAgentBatch:
    """Build MultiAgentBatch from a group of _PolicyCollectors.

    Args:
        episode_id: ID of the episode that produced this data (only used in
            the large-batch warning message).
        batch_builder: The _PolicyCollectorGroup holding the per-policy
            collectors to build from.
        large_batch_threshold: Number of buffered agent steps above which a
            one-time warning about unusually large batches is logged.
        multiple_episodes_in_batch: Whether several episodes may be packed
            into one batch (False implies batch_mode="complete_episodes").

    Returns:
        Always returns a sample batch in MultiAgentBatch format.
    """
    ma_batch = {}
    for pid, collector in batch_builder.policy_collectors.items():
        # Skip policies that collected no agent steps.
        if collector.agent_steps <= 0:
            continue

        if batch_builder.agent_steps > large_batch_threshold and log_once(
            "large_batch_warning"
        ):
            # Message fixes vs. previous version: removed duplicated "that"
            # and added the missing space before the appended "Also, ..."
            # sentence.
            logger.warning(
                "More than {} observations in {} env steps for "
                "episode {} ".format(
                    batch_builder.agent_steps, batch_builder.env_steps, episode_id
                )
                + "are buffered in the sampler. If this is more than you "
                "expected, check that you set a horizon on your "
                "environment correctly and that it terminates at some "
                "point. Note: In multi-agent environments, "
                "`rollout_fragment_length` sets the batch size based on "
                "(across-agents) environment steps, not the steps of "
                "individual agents, which can result in unexpectedly "
                "large batches. "
                + (
                    "Also, you may be waiting for your Env to "
                    "terminate (batch_mode=`complete_episodes`). Make sure "
                    "it does at some point."
                    if not multiple_episodes_in_batch
                    else ""
                )
            )

        batch = collector.build()

        ma_batch[pid] = batch

    # Create the multi agent batch.
    return MultiAgentBatch(policy_batches=ma_batch, env_steps=batch_builder.env_steps)
185
+
186
+
187
@OldAPIStack
def _batch_inference_sample_batches(eval_data: List[SampleBatch]) -> SampleBatch:
    """Concatenate a list of per-agent SampleBatches into one inference batch.

    Args:
        eval_data: list of SampleBatches.

    Returns:
        single batched SampleBatch.
    """
    merged = concat_samples(eval_data)
    # RNN case: each input batch represents exactly one timestep, so every
    # sequence in the merged batch has length 1.
    if "state_in_0" in merged:
        merged[SampleBatch.SEQ_LENS] = np.ones(len(eval_data), dtype=np.int32)
    return merged
202
+
203
+
204
+ @OldAPIStack
205
+ class EnvRunnerV2:
206
+ """Collect experiences from user environment using Connectors."""
207
+
208
    def __init__(
        self,
        worker: "RolloutWorker",
        base_env: BaseEnv,
        multiple_episodes_in_batch: bool,
        callbacks: "RLlibCallback",
        perf_stats: _PerfStats,
        rollout_fragment_length: int = 200,
        count_steps_by: str = "env_steps",
        render: bool = None,
    ):
        """
        Args:
            worker: Reference to the current rollout worker.
            base_env: Env implementing BaseEnv.
            multiple_episodes_in_batch: Whether to pack multiple
                episodes into each batch. This guarantees batches will be exactly
                `rollout_fragment_length` in size.
            callbacks: User callbacks to run on episode events.
            perf_stats: Record perf stats into this object.
            rollout_fragment_length: The length of a fragment to collect
                before building a SampleBatch from the data and resetting
                the SampleBatchBuilder object.
            count_steps_by: One of "env_steps" (default) or "agent_steps".
                Use "agent_steps", if you want rollout lengths to be counted
                by individual agent steps. In a multi-agent env,
                a single env_step contains one or more agent_steps, depending
                on how many agents are present at any given time in the
                ongoing episode.
            render: Whether to try to render the environment after each
                step.
        """
        self._worker = worker
        # ExternalEnvs invert control (they push data instead of being
        # polled), which this connector-based runner cannot drive.
        if isinstance(base_env, ExternalEnvWrapper):
            raise ValueError(
                "Policies using the new Connector API do not support ExternalEnv."
            )
        self._base_env = base_env
        self._multiple_episodes_in_batch = multiple_episodes_in_batch
        self._callbacks = callbacks
        self._perf_stats = perf_stats
        self._rollout_fragment_length = rollout_fragment_length
        self._count_steps_by = count_steps_by
        self._render = render

        # May be populated for image rendering.
        self._simple_image_viewer: Optional[
            "SimpleImageViewer"
        ] = self._get_simple_image_viewer()

        # Keeps track of active episodes.
        self._active_episodes: Dict[EnvID, EpisodeV2] = {}
        # Per-env batch builders, created lazily on first access; the
        # _NewDefaultDict factory receives the missing env_id.
        self._batch_builders: Dict[EnvID, _PolicyCollectorGroup] = _NewDefaultDict(
            self._new_batch_builder
        )

        # Buffered-agent-steps threshold above which a one-time "large batch"
        # warning is logged. An infinite fragment length (complete episodes)
        # falls back to the fixed default threshold.
        self._large_batch_threshold: int = (
            max(MIN_LARGE_BATCH_THRESHOLD, self._rollout_fragment_length * 10)
            if self._rollout_fragment_length != float("inf")
            else DEFAULT_LARGE_BATCH_THRESHOLD
        )
269
+
270
+ def _get_simple_image_viewer(self):
271
+ """Maybe construct a SimpleImageViewer instance for episode rendering."""
272
+ # Try to render the env, if required.
273
+ if not self._render:
274
+ return None
275
+
276
+ try:
277
+ from gymnasium.envs.classic_control.rendering import SimpleImageViewer
278
+
279
+ return SimpleImageViewer()
280
+ except (ImportError, ModuleNotFoundError):
281
+ self._render = False # disable rendering
282
+ logger.warning(
283
+ "Could not import gymnasium.envs.classic_control."
284
+ "rendering! Try `pip install gymnasium[all]`."
285
+ )
286
+
287
+ return None
288
+
289
+ def _call_on_episode_start(self, episode, env_id):
290
+ # Call each policy's Exploration.on_episode_start method.
291
+ # Note: This may break the exploration (e.g. ParameterNoise) of
292
+ # policies in the `policy_map` that have not been recently used
293
+ # (and are therefore stashed to disk). However, we certainly do not
294
+ # want to loop through all (even stashed) policies here as that
295
+ # would counter the purpose of the LRU policy caching.
296
+ for p in self._worker.policy_map.cache.values():
297
+ if getattr(p, "exploration", None) is not None:
298
+ p.exploration.on_episode_start(
299
+ policy=p,
300
+ environment=self._base_env,
301
+ episode=episode,
302
+ tf_sess=p.get_session(),
303
+ )
304
+ # Call `on_episode_start()` callback.
305
+ self._callbacks.on_episode_start(
306
+ worker=self._worker,
307
+ base_env=self._base_env,
308
+ policies=self._worker.policy_map,
309
+ env_index=env_id,
310
+ episode=episode,
311
+ )
312
+
313
+ def _new_batch_builder(self, _) -> _PolicyCollectorGroup:
314
+ """Create a new batch builder.
315
+
316
+ We create a _PolicyCollectorGroup based on the full policy_map
317
+ as the batch builder.
318
+ """
319
+ return _PolicyCollectorGroup(self._worker.policy_map)
320
+
321
+ def run(self) -> Iterator[SampleBatchType]:
322
+ """Samples and yields training episodes continuously.
323
+
324
+ Yields:
325
+ Object containing state, action, reward, terminal condition,
326
+ and other fields as dictated by `policy`.
327
+ """
328
+ while True:
329
+ outputs = self.step()
330
+ for o in outputs:
331
+ yield o
332
+
333
    def step(self) -> List[SampleBatchType]:
        """Samples training episodes by stepping through environments.

        One full poll/eval/send cycle:
          1. Poll all ready sub-environments for new observations.
          2. Process observations (agent connectors, episode bookkeeping).
          3. Batched policy evaluation across vectorized envs.
          4. Post-process eval results into per-env action dicts and send
             them back to the environments.

        Returns:
            Outputs collected this step: finished SampleBatches and/or
            RolloutMetrics.
        """

        self._perf_stats.incr("iters", 1)

        t0 = time.time()
        # Get observations from all ready agents.
        # types: MultiEnvDict, MultiEnvDict, MultiEnvDict, MultiEnvDict, ...
        (
            unfiltered_obs,
            rewards,
            terminateds,
            truncateds,
            infos,
            off_policy_actions,
        ) = self._base_env.poll()
        env_poll_time = time.time() - t0

        # Process observations and prepare for policy evaluation.
        t1 = time.time()
        # types: Set[EnvID], Dict[PolicyID, List[AgentConnectorDataType]],
        # List[Union[RolloutMetrics, SampleBatchType]]
        active_envs, to_eval, outputs = self._process_observations(
            unfiltered_obs=unfiltered_obs,
            rewards=rewards,
            terminateds=terminateds,
            truncateds=truncateds,
            infos=infos,
        )
        self._perf_stats.incr("raw_obs_processing_time", time.time() - t1)

        # Do batched policy eval (across vectorized envs).
        t2 = time.time()
        # types: Dict[PolicyID, Tuple[TensorStructType, StateBatch, dict]]
        eval_results = self._do_policy_eval(to_eval=to_eval)
        self._perf_stats.incr("inference_time", time.time() - t2)

        # Process results and update episode state.
        t3 = time.time()
        actions_to_send: Dict[
            EnvID, Dict[AgentID, EnvActionType]
        ] = self._process_policy_eval_results(
            active_envs=active_envs,
            to_eval=to_eval,
            eval_results=eval_results,
            off_policy_actions=off_policy_actions,
        )
        self._perf_stats.incr("action_processing_time", time.time() - t3)

        # Return computed actions to ready envs. We also send to envs that have
        # taken off-policy actions; those envs are free to ignore the action.
        t4 = time.time()
        self._base_env.send_actions(actions_to_send)
        # Env wait time accounts for both the initial poll and the send.
        self._perf_stats.incr("env_wait_time", env_poll_time + time.time() - t4)

        self._maybe_render()

        return outputs
391
+
392
+ def _get_rollout_metrics(
393
+ self, episode: EpisodeV2, policy_map: Dict[str, Policy]
394
+ ) -> List[RolloutMetrics]:
395
+ """Get rollout metrics from completed episode."""
396
+ # TODO(jungong) : why do we need to handle atari metrics differently?
397
+ # Can we unify atari and normal env metrics?
398
+ atari_metrics: List[RolloutMetrics] = _fetch_atari_metrics(self._base_env)
399
+ if atari_metrics is not None:
400
+ for m in atari_metrics:
401
+ m._replace(custom_metrics=episode.custom_metrics)
402
+ return atari_metrics
403
+ # Create connector metrics
404
+ connector_metrics = {}
405
+ active_agents = episode.get_agents()
406
+ for agent in active_agents:
407
+ policy_id = episode.policy_for(agent)
408
+ policy = episode.policy_map[policy_id]
409
+ connector_metrics[policy_id] = policy.get_connector_metrics()
410
+ # Otherwise, return RolloutMetrics for the episode.
411
+ return [
412
+ RolloutMetrics(
413
+ episode_length=episode.length,
414
+ episode_reward=episode.total_reward,
415
+ agent_rewards=dict(episode.agent_rewards),
416
+ custom_metrics=episode.custom_metrics,
417
+ perf_stats={},
418
+ hist_data=episode.hist_data,
419
+ media=episode.media,
420
+ connector_metrics=connector_metrics,
421
+ )
422
+ ]
423
+
424
    def _process_observations(
        self,
        unfiltered_obs: MultiEnvDict,
        rewards: MultiEnvDict,
        terminateds: MultiEnvDict,
        truncateds: MultiEnvDict,
        infos: MultiEnvDict,
    ) -> Tuple[
        Set[EnvID],
        Dict[PolicyID, List[AgentConnectorDataType]],
        List[Union[RolloutMetrics, SampleBatchType]],
    ]:
        """Process raw obs from env.

        Group data for active agents by policy. Reset environments that are done.

        Args:
            unfiltered_obs: The unfiltered, raw observations from the BaseEnv
                (vectorized, possibly multi-agent). Dict of dict: By env index,
                then agent ID, then mapped to actual obs.
            rewards: The rewards MultiEnvDict of the BaseEnv.
            terminateds: The `terminated` flags MultiEnvDict of the BaseEnv.
            truncateds: The `truncated` flags MultiEnvDict of the BaseEnv.
            infos: The MultiEnvDict of infos dicts of the BaseEnv.

        Returns:
            A tuple of:
                A list of envs that were active during this step.
                AgentConnectorDataType for active agents for policy evaluation.
                SampleBatches and RolloutMetrics for completed agents for output.
        """
        # Output objects.
        # Note that we need to track envs that are active during this round explicitly,
        # just to be confident which envs require us to send at least an empty action
        # dict to.
        # We can not get this from the _active_episode or to_eval lists because
        # 1. All envs are not required to step during every single step. And
        # 2. to_eval only contains data for the agents that are still active. An env may
        # be active but all agents are done during the step.
        active_envs: Set[EnvID] = set()
        to_eval: Dict[PolicyID, List[AgentConnectorDataType]] = defaultdict(list)
        outputs: List[Union[RolloutMetrics, SampleBatchType]] = []

        # For each (vectorized) sub-environment.
        # types: EnvID, Dict[AgentID, EnvObsType]
        for env_id, env_obs in unfiltered_obs.items():
            # Check for env_id having returned an error instead of a multi-agent
            # obs dict. This is how our BaseEnv can tell the caller to `poll()` that
            # one of its sub-environments is faulty and should be restarted (and the
            # ongoing episode should not be used for training).
            if isinstance(env_obs, Exception):
                assert terminateds[env_id]["__all__"] is True, (
                    f"ERROR: When a sub-environment (env-id {env_id}) returns an error "
                    "as observation, the terminateds[__all__] flag must also be set to "
                    "True!"
                )
                # all_agents_obs is an Exception here.
                # Drop this episode and skip to next.
                self._handle_done_episode(
                    env_id=env_id,
                    env_obs_or_exception=env_obs,
                    is_done=True,
                    active_envs=active_envs,
                    to_eval=to_eval,
                    outputs=outputs,
                )
                continue

            # Fetch (or lazily create) the episode tracker for this sub-env.
            if env_id not in self._active_episodes:
                episode: EpisodeV2 = self.create_episode(env_id)
                self._active_episodes[env_id] = episode
            else:
                episode: EpisodeV2 = self._active_episodes[env_id]
            # If this episode is brand-new, call the episode start callback(s).
            # Note: EpisodeV2s are initialized with length=-1 (before the reset).
            if not episode.has_init_obs():
                self._call_on_episode_start(episode, env_id)

            # Check episode termination conditions.
            if terminateds[env_id]["__all__"] or truncateds[env_id]["__all__"]:
                all_agents_done = True
            else:
                all_agents_done = False
                active_envs.add(env_id)

            # Special handling of common info dict.
            episode.set_last_info("__common__", infos[env_id].get("__common__", {}))

            # Agent sample batches grouped by policy. Each set of sample batches will
            # go through agent connectors together.
            sample_batches_by_policy = defaultdict(list)
            # Whether an agent is terminated or truncated.
            agent_terminateds = {}
            agent_truncateds = {}
            for agent_id, obs in env_obs.items():
                assert agent_id != "__all__"

                policy_id: PolicyID = episode.policy_for(agent_id)

                # Per-agent done flags also honor the env-wide "__all__" flag.
                agent_terminated = bool(
                    terminateds[env_id]["__all__"] or terminateds[env_id].get(agent_id)
                )
                agent_terminateds[agent_id] = agent_terminated
                agent_truncated = bool(
                    truncateds[env_id]["__all__"]
                    or truncateds[env_id].get(agent_id, False)
                )
                agent_truncateds[agent_id] = agent_truncated

                # A completely new agent is already done -> Skip entirely.
                if not episode.has_init_obs(agent_id) and (
                    agent_terminated or agent_truncated
                ):
                    continue

                values_dict = {
                    SampleBatch.T: episode.length,  # Episodes start at -1 before we
                    # add the initial obs. After that, we infer from initial obs at
                    # t=0 since that will be our new episode.length.
                    SampleBatch.ENV_ID: env_id,
                    SampleBatch.AGENT_INDEX: episode.agent_index(agent_id),
                    # Last action (SampleBatch.ACTIONS) column will be populated by
                    # StateBufferConnector.
                    # Reward received after taking action at timestep t.
                    SampleBatch.REWARDS: rewards[env_id].get(agent_id, 0.0),
                    # After taking action=a, did we reach terminal?
                    SampleBatch.TERMINATEDS: agent_terminated,
                    # Was the episode truncated artificially
                    # (e.g. b/c of some time limit)?
                    SampleBatch.TRUNCATEDS: agent_truncated,
                    SampleBatch.INFOS: infos[env_id].get(agent_id, {}),
                    SampleBatch.NEXT_OBS: obs,
                }

                # Queue this obs sample for connector preprocessing.
                sample_batches_by_policy[policy_id].append((agent_id, values_dict))

            # The entire episode is done.
            if all_agents_done:
                # Let's check to see if there are any agents that haven't got the
                # last obs yet. If there are, we have to create fake-last
                # observations for them. (the environment is not required to do so if
                # terminateds[__all__]==True or truncateds[__all__]==True).
                for agent_id in episode.get_agents():
                    # If the latest obs we got for this agent is done, or if its
                    # episode state is already done, nothing to do.
                    if (
                        agent_terminateds.get(agent_id, False)
                        or agent_truncateds.get(agent_id, False)
                        or episode.is_done(agent_id)
                    ):
                        continue

                    policy_id: PolicyID = episode.policy_for(agent_id)
                    policy = self._worker.policy_map[policy_id]

                    # Create a fake observation by sampling the original env
                    # observation space.
                    obs_space = get_original_space(policy.observation_space)
                    # Although there is no obs for this agent, there may be
                    # good rewards and info dicts for it.
                    # This is the case for e.g. OpenSpiel games, where a reward
                    # is only earned with the last step, but the obs for that
                    # step is {}.
                    reward = rewards[env_id].get(agent_id, 0.0)
                    info = infos[env_id].get(agent_id, {})
                    values_dict = {
                        SampleBatch.T: episode.length,
                        SampleBatch.ENV_ID: env_id,
                        SampleBatch.AGENT_INDEX: episode.agent_index(agent_id),
                        # TODO(sven): These should be the summed-up(!) rewards since the
                        # last observation received for this agent.
                        SampleBatch.REWARDS: reward,
                        SampleBatch.TERMINATEDS: True,
                        SampleBatch.TRUNCATEDS: truncateds[env_id].get(agent_id, False),
                        SampleBatch.INFOS: info,
                        SampleBatch.NEXT_OBS: obs_space.sample(),
                    }

                    # Queue these fake obs for connector preprocessing too.
                    sample_batches_by_policy[policy_id].append((agent_id, values_dict))

            # Run agent connectors.
            for policy_id, batches in sample_batches_by_policy.items():
                policy: Policy = self._worker.policy_map[policy_id]
                # Collected full MultiAgentDicts for this environment.
                # Run agent connectors.
                assert (
                    policy.agent_connectors
                ), "EnvRunnerV2 requires agent connectors to work."

                acd_list: List[AgentConnectorDataType] = [
                    AgentConnectorDataType(env_id, agent_id, data)
                    for agent_id, data in batches
                ]

                # For all agents mapped to policy_id, run their data
                # through agent_connectors.
                processed = policy.agent_connectors(acd_list)

                for d in processed:
                    # Record transition info if applicable.
                    if not episode.has_init_obs(d.agent_id):
                        episode.add_init_obs(
                            agent_id=d.agent_id,
                            init_obs=d.data.raw_dict[SampleBatch.NEXT_OBS],
                            init_infos=d.data.raw_dict[SampleBatch.INFOS],
                            t=d.data.raw_dict[SampleBatch.T],
                        )
                    else:
                        episode.add_action_reward_done_next_obs(
                            d.agent_id, d.data.raw_dict
                        )

                    # Need to evaluate next actions.
                    if not (
                        all_agents_done
                        or agent_terminateds.get(d.agent_id, False)
                        or agent_truncateds.get(d.agent_id, False)
                        or episode.is_done(d.agent_id)
                    ):
                        # Add to eval set if env is not done and this particular agent
                        # is also not done.
                        item = AgentConnectorDataType(d.env_id, d.agent_id, d.data)
                        to_eval[policy_id].append(item)

            # Finished advancing episode by 1 step, mark it so.
            episode.step()

            # Exception: The very first env.poll() call causes the env to get reset
            # (no step taken yet, just a single starting observation logged).
            # We need to skip this callback in this case.
            if episode.length > 0:
                # Invoke the `on_episode_step` callback after the step is logged
                # to the episode.
                self._callbacks.on_episode_step(
                    worker=self._worker,
                    base_env=self._base_env,
                    policies=self._worker.policy_map,
                    episode=episode,
                    env_index=env_id,
                )

            # Episode is terminated/truncated for all agents
            # (terminateds[__all__] == True or truncateds[__all__] == True).
            if all_agents_done:
                # _handle_done_episode will build a MultiAgentBatch for all
                # the agents that are done during this step of rollout in
                # the case of _multiple_episodes_in_batch=False.
                self._handle_done_episode(
                    env_id,
                    env_obs,
                    terminateds[env_id]["__all__"] or truncateds[env_id]["__all__"],
                    active_envs,
                    to_eval,
                    outputs,
                )

            # Try to build something.
            if self._multiple_episodes_in_batch:
                sample_batch = self._try_build_truncated_episode_multi_agent_batch(
                    self._batch_builders[env_id], episode
                )
                if sample_batch:
                    outputs.append(sample_batch)

                    # SampleBatch built from data collected by batch_builder.
                    # Clean up and delete the batch_builder.
                    del self._batch_builders[env_id]

        return active_envs, to_eval, outputs
695
+
696
    def _build_done_episode(
        self,
        env_id: EnvID,
        is_done: bool,
        outputs: List[SampleBatchType],
    ):
        """Builds a MultiAgentSampleBatch from the episode and adds it to outputs.

        Args:
            env_id: The env id.
            is_done: Whether the env is done.
            outputs: The list of outputs to add the built batch to (only
                appended to when not packing multiple episodes per batch).
        """
        episode: EpisodeV2 = self._active_episodes[env_id]
        batch_builder = self._batch_builders[env_id]

        # Post-process (e.g. advantage computation) and flush the episode's
        # collected data into the per-policy collectors.
        episode.postprocess_episode(
            batch_builder=batch_builder,
            is_done=is_done,
            check_dones=is_done,
        )

        # If, we are not allowed to pack the next episode into the same
        # SampleBatch (batch_mode=complete_episodes) -> Build the
        # MultiAgentBatch from a single episode and add it to "outputs".
        # Otherwise, just postprocess and continue collecting across
        # episodes.
        if not self._multiple_episodes_in_batch:
            ma_sample_batch = _build_multi_agent_batch(
                episode.episode_id,
                batch_builder,
                self._large_batch_threshold,
                self._multiple_episodes_in_batch,
            )
            if ma_sample_batch:
                outputs.append(ma_sample_batch)

            # SampleBatch built from data collected by batch_builder.
            # Clean up and delete the batch_builder.
            del self._batch_builders[env_id]
736
+
737
    def __process_resetted_obs_for_eval(
        self,
        env_id: EnvID,
        obs: Dict[EnvID, Dict[AgentID, EnvObsType]],
        infos: Dict[EnvID, Dict[AgentID, EnvInfoDict]],
        episode: EpisodeV2,
        to_eval: Dict[PolicyID, List[AgentConnectorDataType]],
    ):
        """Process resetted obs through agent connectors for policy eval.

        Args:
            env_id: The env id.
            obs: The Resetted obs (keyed by env, then agent).
            infos: The infos dicts accompanying the reset obs.
            episode: New episode.
            to_eval: List of agent connector data for policy eval.
        """
        # Group the per-agent reset observations by the policy serving them.
        per_policy_resetted_obs: Dict[PolicyID, List] = defaultdict(list)
        # types: AgentID, EnvObsType
        for agent_id, raw_obs in obs[env_id].items():
            policy_id: PolicyID = episode.policy_for(agent_id)
            per_policy_resetted_obs[policy_id].append((agent_id, raw_obs))

        for policy_id, agents_obs in per_policy_resetted_obs.items():
            policy = self._worker.policy_map[policy_id]
            # NOTE: the comprehension variable `obs` below shadows the `obs`
            # parameter: NEXT_OBS receives the per-agent raw obs. INFOS,
            # however, receives the full `infos` structure — presumably
            # downstream connectors index into it; TODO confirm.
            acd_list: List[AgentConnectorDataType] = [
                AgentConnectorDataType(
                    env_id,
                    agent_id,
                    {
                        SampleBatch.NEXT_OBS: obs,
                        SampleBatch.INFOS: infos,
                        SampleBatch.T: episode.length,
                        SampleBatch.AGENT_INDEX: episode.agent_index(agent_id),
                    },
                )
                for agent_id, obs in agents_obs
            ]
            # Call agent connectors on these initial obs.
            processed = policy.agent_connectors(acd_list)

            for d in processed:
                episode.add_init_obs(
                    agent_id=d.agent_id,
                    init_obs=d.data.raw_dict[SampleBatch.NEXT_OBS],
                    init_infos=d.data.raw_dict[SampleBatch.INFOS],
                    t=d.data.raw_dict[SampleBatch.T],
                )
                to_eval[policy_id].append(d)
785
+
786
    def _handle_done_episode(
        self,
        env_id: EnvID,
        env_obs_or_exception: MultiAgentDict,
        is_done: bool,
        active_envs: Set[EnvID],
        to_eval: Dict[PolicyID, List[AgentConnectorDataType]],
        outputs: List[SampleBatchType],
    ) -> None:
        """Handle an all-finished episode.

        Adds collected SampleBatch data to the batch builder, emits rollout
        metrics, ends the finished episode, and resets the corresponding
        sub-environment (starting a fresh EpisodeV2 for it).

        Args:
            env_id: Environment ID.
            env_obs_or_exception: Last per-environment observation, or the
                Exception raised by a faulty sub-environment.
            is_done: If all agents are done.
            active_envs: Set of active env ids.
            to_eval: Output container for policy eval data.
            outputs: Output container for collected sample batches.
        """
        if isinstance(env_obs_or_exception, Exception):
            episode_or_exception: Exception = env_obs_or_exception
            # Tell the sampler we have got a faulty episode.
            outputs.append(RolloutMetrics(episode_faulty=True))
        else:
            episode_or_exception: EpisodeV2 = self._active_episodes[env_id]
            # Add rollout metrics.
            outputs.extend(
                self._get_rollout_metrics(
                    episode_or_exception, policy_map=self._worker.policy_map
                )
            )
        # Output the collected episode after adding rollout metrics so that we
        # always fetch metrics with RolloutWorker before we fetch samples.
        # This is because we need to behave like env_runner() for now.
        self._build_done_episode(env_id, is_done, outputs)

        # Clean up and delete the post-processed episode now that we have
        # collected its data.
        self.end_episode(env_id, episode_or_exception)
        # Create a new episode instance (before we reset the sub-environment).
        new_episode: EpisodeV2 = self.create_episode(env_id)

        # The sub environment at index `env_id` might throw an exception
        # during the following `try_reset()` attempt. If configured with
        # `restart_failed_sub_environments=True`, the BaseEnv will restart
        # the affected sub environment (create a new one using its c'tor) and
        # must reset the recreated sub env right after that.
        # Should the sub environment fail indefinitely during these
        # repeated reset attempts, the entire worker will be blocked.
        # This would be ok, b/c the alternative would be the worker crashing
        # entirely.
        while True:
            resetted_obs, resetted_infos = self._base_env.try_reset(env_id)

            # Success (or async reset pending) -> stop retrying.
            if (
                resetted_obs is None
                or resetted_obs == ASYNC_RESET_RETURN
                or not isinstance(resetted_obs[env_id], Exception)
            ):
                break
            else:
                # Report a faulty episode.
                outputs.append(RolloutMetrics(episode_faulty=True))

                # Reset connector state if this is a hard reset.
                for p in self._worker.policy_map.cache.values():
                    p.agent_connectors.reset(env_id)

        # Creates a new episode if this is not async return.
        # If reset is async, we will get its result in some future poll.
        if resetted_obs is not None and resetted_obs != ASYNC_RESET_RETURN:
            self._active_episodes[env_id] = new_episode
            self._call_on_episode_start(new_episode, env_id)

            self.__process_resetted_obs_for_eval(
                env_id,
                resetted_obs,
                resetted_infos,
                new_episode,
                to_eval,
            )

            # Step after adding initial obs. This will give us 0 env and agent step.
            new_episode.step()

        active_envs.add(env_id)
874
+
875
+ def create_episode(self, env_id: EnvID) -> EpisodeV2:
876
+ """Creates a new EpisodeV2 instance and returns it.
877
+
878
+ Calls `on_episode_created` callbacks, but does NOT reset the respective
879
+ sub-environment yet.
880
+
881
+ Args:
882
+ env_id: Env ID.
883
+
884
+ Returns:
885
+ The newly created EpisodeV2 instance.
886
+ """
887
+ # Make sure we currently don't have an active episode under this env ID.
888
+ assert env_id not in self._active_episodes
889
+
890
+ # Create a new episode under the same `env_id` and call the
891
+ # `on_episode_created` callbacks.
892
+ new_episode = EpisodeV2(
893
+ env_id,
894
+ self._worker.policy_map,
895
+ self._worker.policy_mapping_fn,
896
+ worker=self._worker,
897
+ callbacks=self._callbacks,
898
+ )
899
+
900
+ # Call `on_episode_created()` callback.
901
+ self._callbacks.on_episode_created(
902
+ worker=self._worker,
903
+ base_env=self._base_env,
904
+ policies=self._worker.policy_map,
905
+ env_index=env_id,
906
+ episode=new_episode,
907
+ )
908
+ return new_episode
909
+
910
    def end_episode(
        self, env_id: EnvID, episode_or_exception: Union[EpisodeV2, Exception]
    ):
        """Cleans up an episode that has finished.

        Fires the `on_episode_end` callback and each cached policy's
        Exploration end-of-episode hook, validates that the episode recorded
        at least one agent step, then drops it from the active-episodes map.

        Args:
            env_id: Env ID.
            episode_or_exception: Instance of an episode if it finished
                successfully. Otherwise, the exception that was thrown.

        Raises:
            ValueError: If a successfully finished episode never recorded any
                agent interactions.
        """
        # Signal the end of an episode, either successfully with an Episode or
        # unsuccessfully with an Exception.
        self._callbacks.on_episode_end(
            worker=self._worker,
            base_env=self._base_env,
            policies=self._worker.policy_map,
            episode=episode_or_exception,
            env_index=env_id,
        )

        # Call each (in-memory) policy's Exploration.on_episode_end
        # method.
        # Note: This may break the exploration (e.g. ParameterNoise) of
        # policies in the `policy_map` that have not been recently used
        # (and are therefore stashed to disk). However, we certainly do not
        # want to loop through all (even stashed) policies here as that
        # would counter the purpose of the LRU policy caching.
        for p in self._worker.policy_map.cache.values():
            if getattr(p, "exploration", None) is not None:
                p.exploration.on_episode_end(
                    policy=p,
                    environment=self._base_env,
                    episode=episode_or_exception,
                    tf_sess=p.get_session(),
                )

        if isinstance(episode_or_exception, EpisodeV2):
            episode = episode_or_exception
            if episode.total_agent_steps == 0:
                # if the key does not exist it means that throughout the episode all
                # observations were empty (i.e. there was no agent in the env)
                msg = (
                    f"Data from episode {episode.episode_id} does not show any agent "
                    f"interactions. Hint: Make sure for at least one timestep in the "
                    f"episode, env.step() returns non-empty values."
                )
                raise ValueError(msg)

        # Clean up the episode and batch_builder for this env id.
        if env_id in self._active_episodes:
            del self._active_episodes[env_id]
961
+
962
    def _try_build_truncated_episode_multi_agent_batch(
        self, batch_builder: _PolicyCollectorGroup, episode: EpisodeV2
    ) -> Union[None, SampleBatch, MultiAgentBatch]:
        """Builds a multi-agent batch once the rollout fragment length is hit.

        Args:
            batch_builder: Per-policy collector group holding the already
                postprocessed data for the batch under construction.
            episode: The (possibly still ongoing) episode whose active steps
                count toward the fragment length.

        Returns:
            The built batch if `rollout_fragment_length` has been reached and
            any agent data was collected; None otherwise.
        """
        # Measure batch size in env-steps.
        if self._count_steps_by == "env_steps":
            built_steps = batch_builder.env_steps
            ongoing_steps = episode.active_env_steps
        # Measure batch-size in agent-steps.
        else:
            built_steps = batch_builder.agent_steps
            ongoing_steps = episode.active_agent_steps

        # Reached the fragment-len -> We should build an MA-Batch.
        if built_steps + ongoing_steps >= self._rollout_fragment_length:
            # In env-steps mode, steps accrue one env-step at a time, so we
            # should land exactly on the fragment length (never overshoot).
            if self._count_steps_by != "agent_steps":
                assert built_steps + ongoing_steps == self._rollout_fragment_length, (
                    f"built_steps ({built_steps}) + ongoing_steps ({ongoing_steps}) != "
                    f"rollout_fragment_length ({self._rollout_fragment_length})."
                )

            # If we reached the fragment-len only because of `episode_id`
            # (still ongoing) -> postprocess `episode_id` first.
            if built_steps < self._rollout_fragment_length:
                episode.postprocess_episode(batch_builder=batch_builder, is_done=False)

            # If builder has collected some data,
            # build the MA-batch and add to return values.
            if batch_builder.agent_steps > 0:
                return _build_multi_agent_batch(
                    episode.episode_id,
                    batch_builder,
                    self._large_batch_threshold,
                    self._multiple_episodes_in_batch,
                )
            # No batch-builder:
            # We have reached the rollout-fragment length w/o any agent
            # steps! Warn that the environment may never request any
            # actions from any agents.
            elif log_once("no_agent_steps"):
                logger.warning(
                    "Your environment seems to be stepping w/o ever "
                    "emitting agent observations (agents are never "
                    "requested to act)!"
                )

        return None
1008
+
1009
+ def _do_policy_eval(
1010
+ self,
1011
+ to_eval: Dict[PolicyID, List[AgentConnectorDataType]],
1012
+ ) -> Dict[PolicyID, PolicyOutputType]:
1013
+ """Call compute_actions on collected episode data to get next action.
1014
+
1015
+ Args:
1016
+ to_eval: Mapping of policy IDs to lists of AgentConnectorDataType objects
1017
+ (items in these lists will be the batch's items for the model
1018
+ forward pass).
1019
+
1020
+ Returns:
1021
+ Dict mapping PolicyIDs to compute_actions_from_input_dict() outputs.
1022
+ """
1023
+ policies = self._worker.policy_map
1024
+
1025
+ # In case policy map has changed, try to find the new policy that
1026
+ # should handle all these per-agent eval data.
1027
+ # Throws exception if these agents are mapped to multiple different
1028
+ # policies now.
1029
+ def _try_find_policy_again(eval_data: AgentConnectorDataType):
1030
+ policy_id = None
1031
+ for d in eval_data:
1032
+ episode = self._active_episodes[d.env_id]
1033
+ # Force refresh policy mapping on the episode.
1034
+ pid = episode.policy_for(d.agent_id, refresh=True)
1035
+ if policy_id is not None and pid != policy_id:
1036
+ raise ValueError(
1037
+ "Policy map changed. The list of eval data that was handled "
1038
+ f"by a same policy is now handled by policy {pid} "
1039
+ "and {policy_id}. "
1040
+ "Please don't do this in the middle of an episode."
1041
+ )
1042
+ policy_id = pid
1043
+ return _get_or_raise(self._worker.policy_map, policy_id)
1044
+
1045
+ eval_results: Dict[PolicyID, TensorStructType] = {}
1046
+ for policy_id, eval_data in to_eval.items():
1047
+ # In case the policyID has been removed from this worker, we need to
1048
+ # re-assign policy_id and re-lookup the Policy object to use.
1049
+ try:
1050
+ policy: Policy = _get_or_raise(policies, policy_id)
1051
+ except ValueError:
1052
+ # policy_mapping_fn from the worker may have already been
1053
+ # changed (mapping fn not staying constant within one episode).
1054
+ policy: Policy = _try_find_policy_again(eval_data)
1055
+
1056
+ input_dict = _batch_inference_sample_batches(
1057
+ [d.data.sample_batch for d in eval_data]
1058
+ )
1059
+
1060
+ eval_results[policy_id] = policy.compute_actions_from_input_dict(
1061
+ input_dict,
1062
+ timestep=policy.global_timestep,
1063
+ episodes=[self._active_episodes[t.env_id] for t in eval_data],
1064
+ )
1065
+
1066
+ return eval_results
1067
+
1068
    def _process_policy_eval_results(
        self,
        active_envs: Set[EnvID],
        to_eval: Dict[PolicyID, List[AgentConnectorDataType]],
        eval_results: Dict[PolicyID, PolicyOutputType],
        off_policy_actions: MultiEnvDict,
    ):
        """Process the output of policy neural network evaluation.

        Records policy evaluation results into agent connectors and
        returns replies to send back to agents in the env.

        Args:
            active_envs: Set of env IDs that are still active.
            to_eval: Mapping of policy IDs to lists of AgentConnectorDataType objects.
            eval_results: Mapping of policy IDs to list of
                actions, rnn-out states, extra-action-fetches dicts.
            off_policy_actions: Doubly keyed dict of env-ids -> agent ids ->
                off-policy-action, returned by a `BaseEnv.poll()` call.

        Returns:
            Nested dict of env id -> agent id -> actions to be sent to
            Env (np.ndarrays).
        """
        actions_to_send: Dict[EnvID, Dict[AgentID, EnvActionType]] = defaultdict(dict)

        for env_id in active_envs:
            actions_to_send[env_id] = {}  # at minimum send empty dict

        # types: PolicyID, List[AgentConnectorDataType]
        for policy_id, eval_data in to_eval.items():
            actions: TensorStructType = eval_results[policy_id][0]
            actions = convert_to_numpy(actions)

            rnn_out: StateBatches = eval_results[policy_id][1]
            extra_action_out: dict = eval_results[policy_id][2]

            # In case actions is a list (representing the 0th dim of a batch of
            # primitive actions), try converting it first.
            if isinstance(actions, list):
                actions = np.array(actions)
            # Split action-component batches into single action rows.
            actions: List[EnvActionType] = unbatch(actions)

            policy: Policy = _get_or_raise(self._worker.policy_map, policy_id)
            assert (
                policy.agent_connectors and policy.action_connectors
            ), "EnvRunnerV2 requires action connectors to work."

            # types: int, EnvActionType
            for i, action in enumerate(actions):
                env_id: int = eval_data[i].env_id
                agent_id: AgentID = eval_data[i].agent_id
                input_dict: TensorStructType = eval_data[i].data.raw_dict

                # Pick row i out of the (possibly nested) batched RNN states.
                rnn_states: List[StateBatches] = tree.map_structure(
                    lambda x, i=i: x[i], rnn_out
                )

                # extra_action_out could be a nested dict
                fetches: Dict = tree.map_structure(
                    lambda x, i=i: x[i], extra_action_out
                )

                # Post-process policy output by running them through action connectors.
                ac_data = ActionConnectorDataType(
                    env_id, agent_id, input_dict, (action, rnn_states, fetches)
                )

                action_to_send, rnn_states, fetches = policy.action_connectors(
                    ac_data
                ).output

                # The action we want to buffer is the direct output of
                # compute_actions_from_input_dict() here. This is because we want to
                # send the unsquashed actions to the environment while learning and
                # possibly basing subsequent actions on the squashed actions.
                action_to_buffer = (
                    action
                    if env_id not in off_policy_actions
                    or agent_id not in off_policy_actions[env_id]
                    else off_policy_actions[env_id][agent_id]
                )

                # Notify agent connectors with this new policy output.
                # Necessary for state buffering agent connectors, for example.
                ac_data: ActionConnectorDataType = ActionConnectorDataType(
                    env_id,
                    agent_id,
                    input_dict,
                    (action_to_buffer, rnn_states, fetches),
                )
                policy.agent_connectors.on_policy_output(ac_data)

                # Each agent may only receive one action per env step.
                assert agent_id not in actions_to_send[env_id]
                actions_to_send[env_id][agent_id] = action_to_send

        return actions_to_send
1166
+
1167
+ def _maybe_render(self):
1168
+ """Visualize environment."""
1169
+ # Check if we should render.
1170
+ if not self._render or not self._simple_image_viewer:
1171
+ return
1172
+
1173
+ t5 = time.time()
1174
+
1175
+ # Render can either return an RGB image (uint8 [w x h x 3] numpy
1176
+ # array) or take care of rendering itself (returning True).
1177
+ rendered = self._base_env.try_render()
1178
+ # Rendering returned an image -> Display it in a SimpleImageViewer.
1179
+ if isinstance(rendered, np.ndarray) and len(rendered.shape) == 3:
1180
+ self._simple_image_viewer.imshow(rendered)
1181
+ elif rendered not in [True, False, None]:
1182
+ raise ValueError(
1183
+ f"The env's ({self._base_env}) `try_render()` method returned an"
1184
+ " unsupported value! Make sure you either return a "
1185
+ "uint8/w x h x 3 (RGB) image or handle rendering in a "
1186
+ "window and then return `True`."
1187
+ )
1188
+
1189
+ self._perf_stats.incr("env_render_time", time.time() - t5)
1190
+
1191
+
1192
def _fetch_atari_metrics(base_env: BaseEnv) -> List[RolloutMetrics]:
    """Atari games have multiple logical episodes, one per life.

    However, for metrics reporting we count full episodes, all lives included.

    Returns None if `base_env` has no sub-environments or any sub-environment
    lacks a MonitorEnv wrapper; otherwise one RolloutMetrics per finished
    full episode.
    """
    sub_envs = base_env.get_sub_environments()
    if not sub_envs:
        return None

    metrics = []
    for sub_env in sub_envs:
        # Full-episode stats are tracked by the MonitorEnv wrapper.
        monitor = get_wrapper_by_cls(sub_env, MonitorEnv)
        if not monitor:
            return None
        metrics.extend(
            RolloutMetrics(eps_len, eps_rew)
            for eps_rew, eps_len in monitor.next_episode_results()
        )
    return metrics
1208
+
1209
+
1210
def _get_or_raise(
    mapping: Dict[PolicyID, Union[Policy, Preprocessor, Filter]], policy_id: PolicyID
) -> Union[Policy, Preprocessor, Filter]:
    """Returns the object stored under `policy_id` in `mapping`.

    Args:
        mapping (Dict[PolicyID, Union[Policy, Preprocessor, Filter]]): The
            mapping dict from policy id (str) to actual object (Policy,
            Preprocessor, etc.).
        policy_id: The policy ID to lookup.

    Returns:
        Union[Policy, Preprocessor, Filter]: The found object.

    Raises:
        ValueError: If `policy_id` cannot be found in `mapping`.
    """
    # Guard clause: known ID -> hand the object straight back.
    if policy_id in mapping:
        return mapping[policy_id]
    raise ValueError(
        "Could not find policy for agent: PolicyID `{}` not found "
        "in policy map, whose keys are `{}`.".format(policy_id, mapping.keys())
    )
.venv/lib/python3.11/site-packages/ray/rllib/evaluation/episode_v2.py ADDED
@@ -0,0 +1,378 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ from collections import defaultdict
3
+ import numpy as np
4
+ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple
5
+
6
+ from ray.rllib.env.base_env import _DUMMY_AGENT_ID
7
+ from ray.rllib.evaluation.collectors.simple_list_collector import (
8
+ _PolicyCollector,
9
+ _PolicyCollectorGroup,
10
+ )
11
+ from ray.rllib.evaluation.collectors.agent_collector import AgentCollector
12
+ from ray.rllib.policy.policy_map import PolicyMap
13
+ from ray.rllib.policy.sample_batch import SampleBatch
14
+ from ray.rllib.utils.annotations import OldAPIStack
15
+ from ray.rllib.utils.typing import AgentID, EnvID, EnvInfoDict, PolicyID, TensorType
16
+
17
+ if TYPE_CHECKING:
18
+ from ray.rllib.callbacks.callbacks import RLlibCallback
19
+ from ray.rllib.evaluation.rollout_worker import RolloutWorker
20
+
21
+
22
@OldAPIStack
class EpisodeV2:
    """Tracks the current state of a (possibly multi-agent) episode.

    Pure bookkeeping: agent->policy bindings, per-agent AgentCollectors
    holding raw trajectory data, reward sums, step counters, and the last
    terminated/truncated/info signals seen per agent.
    """

    def __init__(
        self,
        env_id: EnvID,
        policies: PolicyMap,
        policy_mapping_fn: Callable[[AgentID, "EpisodeV2", "RolloutWorker"], PolicyID],
        *,
        worker: Optional["RolloutWorker"] = None,
        callbacks: Optional["RLlibCallback"] = None,
    ):
        """Initializes an Episode instance.

        Args:
            env_id: The environment's ID in which this episode runs.
            policies: The PolicyMap object (mapping PolicyIDs to Policy
                objects) to use for determining, which policy is used for
                which agent.
            policy_mapping_fn: The mapping function mapping AgentIDs to
                PolicyIDs.
            worker: The RolloutWorker instance, in which this episode runs.
            callbacks: Callback object; used by `postprocess_episode` to fire
                `on_postprocess_trajectory`.
        """
        # Unique id identifying this trajectory.
        self.episode_id: int = random.randrange(int(1e18))
        # ID of the environment this episode is tracking.
        self.env_id = env_id
        # Summed reward across all agents in this episode.
        self.total_reward: float = 0.0
        # Active (uncollected) # of env steps taken by this episode.
        # Start from -1. After add_init_obs(), we will be at 0 step.
        self.active_env_steps: int = -1
        # Total # of env steps taken by this episode.
        # Start from -1, After add_init_obs(), we will be at 0 step.
        self.total_env_steps: int = -1
        # Active (uncollected) agent steps.
        self.active_agent_steps: int = 0
        # Total # of steps take by all agents in this env.
        self.total_agent_steps: int = 0
        # Dict for user to add custom metrics.
        # TODO (sven): We should probably unify custom_metrics, user_data,
        # and hist_data into a single data container for user to track per-step.
        # metrics and states.
        self.custom_metrics: Dict[str, float] = {}
        # Temporary storage. E.g. storing data in between two custom
        # callbacks referring to the same episode.
        self.user_data: Dict[str, Any] = {}
        # Dict mapping str keys to List[float] for storage of
        # per-timestep float data throughout the episode.
        self.hist_data: Dict[str, List[float]] = {}
        # Arbitrary media (e.g. rendered videos) attached by user callbacks.
        self.media: Dict[str, Any] = {}

        self.worker = worker
        self.callbacks = callbacks

        self.policy_map: PolicyMap = policies
        self.policy_mapping_fn: Callable[
            [AgentID, "EpisodeV2", "RolloutWorker"], PolicyID
        ] = policy_mapping_fn
        # Per-agent data collectors.
        self._agent_to_policy: Dict[AgentID, PolicyID] = {}
        self._agent_collectors: Dict[AgentID, AgentCollector] = {}

        # Monotonically increasing index handed out to newly seen agents.
        self._next_agent_index: int = 0
        self._agent_to_index: Dict[AgentID, int] = {}

        # Summed rewards broken down by agent.
        self.agent_rewards: Dict[Tuple[AgentID, PolicyID], float] = defaultdict(float)
        self._agent_reward_history: Dict[AgentID, List[float]] = defaultdict(list)

        self._has_init_obs: Dict[AgentID, bool] = {}
        self._last_terminateds: Dict[AgentID, bool] = {}
        self._last_truncateds: Dict[AgentID, bool] = {}
        # Keep last info dict around, in case an environment tries to signal
        # us something.
        self._last_infos: Dict[AgentID, Dict] = {}

    def policy_for(
        self, agent_id: AgentID = _DUMMY_AGENT_ID, refresh: bool = False
    ) -> PolicyID:
        """Returns and stores the policy ID for the specified agent.

        If the agent is new, the policy mapping fn will be called to bind the
        agent to a policy for the duration of the entire episode (even if the
        policy_mapping_fn is changed in the meantime!).

        Args:
            agent_id: The agent ID to lookup the policy ID for.
            refresh: If True, force a re-run of the policy mapping fn even if
                this agent was already bound to a policy.

        Returns:
            The policy ID for the specified agent.

        Raises:
            KeyError: If the mapped policy ID is not in `self.policy_map`.
        """

        # Perform a new policy_mapping_fn lookup and bind AgentID for the
        # duration of this episode to the returned PolicyID.
        if agent_id not in self._agent_to_policy or refresh:
            policy_id = self._agent_to_policy[agent_id] = self.policy_mapping_fn(
                agent_id,  # agent_id
                self,  # episode
                worker=self.worker,
            )
        # Use already determined PolicyID.
        else:
            policy_id = self._agent_to_policy[agent_id]

        # PolicyID not found in policy map -> Error.
        if policy_id not in self.policy_map:
            raise KeyError(
                "policy_mapping_fn returned invalid policy id " f"'{policy_id}'!"
            )
        return policy_id

    def get_agents(self) -> List[AgentID]:
        """Returns list of agent IDs that have appeared in this episode.

        Returns:
            The list of all agent IDs that have appeared so far in this
            episode.
        """
        return list(self._agent_to_index.keys())

    def agent_index(self, agent_id: AgentID) -> int:
        """Get the index of an agent among its environment.

        A new index will be created if an agent is seen for the first time.

        Args:
            agent_id: ID of an agent.

        Returns:
            The index of this agent.
        """
        if agent_id not in self._agent_to_index:
            self._agent_to_index[agent_id] = self._next_agent_index
            self._next_agent_index += 1
        return self._agent_to_index[agent_id]

    def step(self) -> None:
        """Advance the episode forward by one step."""
        self.active_env_steps += 1
        self.total_env_steps += 1

    def add_init_obs(
        self,
        *,
        agent_id: AgentID,
        init_obs: TensorType,
        init_infos: Dict[str, TensorType],
        t: int = -1,
    ) -> None:
        """Add initial env obs at the start of a new episode.

        Creates this agent's AgentCollector and seeds it with the reset
        observation/infos. Must be called at most once per agent.

        Args:
            agent_id: Agent ID.
            init_obs: Initial observations.
            init_infos: Initial infos dicts.
            t: timestamp.
        """
        policy = self.policy_map[self.policy_for(agent_id)]

        # Add initial obs to Trajectory.
        assert agent_id not in self._agent_collectors

        self._agent_collectors[agent_id] = AgentCollector(
            policy.view_requirements,
            max_seq_len=policy.config["model"]["max_seq_len"],
            disable_action_flattening=policy.config.get(
                "_disable_action_flattening", False
            ),
            is_policy_recurrent=policy.is_recurrent(),
            # (sic) "intial_states" is the keyword AgentCollector expects.
            intial_states=policy.get_initial_state(),
            _enable_new_api_stack=False,
        )
        self._agent_collectors[agent_id].add_init_obs(
            episode_id=self.episode_id,
            agent_index=self.agent_index(agent_id),
            env_id=self.env_id,
            init_obs=init_obs,
            init_infos=init_infos,
            t=t,
        )

        self._has_init_obs[agent_id] = True

    def add_action_reward_done_next_obs(
        self,
        agent_id: AgentID,
        values: Dict[str, TensorType],
    ) -> None:
        """Add action, reward, info, and next_obs as a new step.

        Also updates reward totals/history and the last
        terminated/truncated/info trackers for this agent.

        Args:
            agent_id: Agent ID.
            values: Dict of action, reward, info, and next_obs.
        """
        # Make sure, agent already has some (at least init) data.
        assert agent_id in self._agent_collectors

        self.active_agent_steps += 1
        self.total_agent_steps += 1

        # Include the current agent id for multi-agent algorithms.
        if agent_id != _DUMMY_AGENT_ID:
            values["agent_id"] = agent_id

        # Add action/reward/next-obs (and other data) to Trajectory.
        self._agent_collectors[agent_id].add_action_reward_next_obs(values)

        # Keep track of agent reward history.
        reward = values[SampleBatch.REWARDS]
        self.total_reward += reward
        self.agent_rewards[(agent_id, self.policy_for(agent_id))] += reward
        self._agent_reward_history[agent_id].append(reward)

        # Keep track of last terminated info for agent.
        if SampleBatch.TERMINATEDS in values:
            self._last_terminateds[agent_id] = values[SampleBatch.TERMINATEDS]
        # Keep track of last truncated info for agent.
        if SampleBatch.TRUNCATEDS in values:
            self._last_truncateds[agent_id] = values[SampleBatch.TRUNCATEDS]

        # Keep track of last info dict if available.
        if SampleBatch.INFOS in values:
            self.set_last_info(agent_id, values[SampleBatch.INFOS])

    def postprocess_episode(
        self,
        batch_builder: _PolicyCollectorGroup,
        is_done: bool = False,
        check_dones: bool = False,
    ) -> None:
        """Build and return currently collected training samples by policies.

        Clear agent collector states if this episode is done.

        Args:
            batch_builder: _PolicyCollectorGroup for saving the collected per-agent
                sample batches.
            is_done: If this episode is done (terminated or truncated).
            check_dones: Whether to make sure per-agent trajectories are actually done.

        Raises:
            ValueError: If `is_done` and `check_dones` but an agent trajectory
                lacks a final done signal, or if a pre-batch spans more than
                one trajectory.
        """
        # TODO: (sven) Once we implement multi-agent communication channels,
        # we have to resolve the restriction of only sending other agent
        # batches from the same policy to the postprocess methods.
        # Build SampleBatches for the given episode.
        pre_batches = {}
        for agent_id, collector in self._agent_collectors.items():
            # Build only if there is data and agent is part of given episode.
            if collector.agent_steps == 0:
                continue
            pid = self.policy_for(agent_id)
            policy = self.policy_map[pid]
            pre_batch = collector.build_for_training(policy.view_requirements)
            pre_batches[agent_id] = (pid, policy, pre_batch)

        for agent_id, (pid, policy, pre_batch) in pre_batches.items():
            # Entire episode is said to be done.
            # Error if no DONE at end of this agent's trajectory.
            if is_done and check_dones and not pre_batch.is_terminated_or_truncated():
                raise ValueError(
                    "Episode {} terminated for all agents, but we still "
                    "don't have a last observation for agent {} (policy "
                    "{}). ".format(self.episode_id, agent_id, self.policy_for(agent_id))
                    + "Please ensure that you include the last observations "
                    "of all live agents when setting done[__all__] to "
                    "True."
                )

            # Skip a trajectory's postprocessing (and thus using it for training),
            # if its agent's info exists and contains the training_enabled=False
            # setting (used by our PolicyClients).
            if not self._last_infos.get(agent_id, {}).get("training_enabled", True):
                continue

            if (
                not pre_batch.is_single_trajectory()
                or len(np.unique(pre_batch[SampleBatch.EPS_ID])) > 1
            ):
                raise ValueError(
                    "Batches sent to postprocessing must only contain steps "
                    "from a single trajectory.",
                    pre_batch,
                )

            # Other agents' batches may be fed to this agent's postprocessing.
            if len(pre_batches) > 1:
                other_batches = pre_batches.copy()
                del other_batches[agent_id]
            else:
                other_batches = {}

            # Call the Policy's Exploration's postprocess method.
            post_batch = pre_batch
            if getattr(policy, "exploration", None) is not None:
                policy.exploration.postprocess_trajectory(
                    policy, post_batch, policy.get_session()
                )
            post_batch.set_get_interceptor(None)
            post_batch = policy.postprocess_trajectory(post_batch, other_batches, self)

            # Imported here to avoid a circular import at module load time.
            from ray.rllib.evaluation.rollout_worker import get_global_worker

            self.callbacks.on_postprocess_trajectory(
                worker=get_global_worker(),
                episode=self,
                agent_id=agent_id,
                policy_id=pid,
                policies=self.policy_map,
                postprocessed_batch=post_batch,
                original_batches=pre_batches,
            )

            # Append post_batch for return.
            if pid not in batch_builder.policy_collectors:
                batch_builder.policy_collectors[pid] = _PolicyCollector(policy)
            batch_builder.policy_collectors[pid].add_postprocessed_batch_for_training(
                post_batch, policy.view_requirements
            )

        batch_builder.agent_steps += self.active_agent_steps
        batch_builder.env_steps += self.active_env_steps

        # AgentCollector cleared.
        self.active_agent_steps = 0
        self.active_env_steps = 0

    def has_init_obs(self, agent_id: AgentID = None) -> bool:
        """Returns whether this episode has initial obs for an agent.

        If agent_id is None, return whether we have received any initial obs,
        in other words, whether this episode is completely fresh.
        """
        if agent_id is not None:
            return agent_id in self._has_init_obs and self._has_init_obs[agent_id]
        else:
            return any(list(self._has_init_obs.values()))

    def is_done(self, agent_id: AgentID) -> bool:
        """Returns True if the agent is either terminated or truncated."""
        return self.is_terminated(agent_id) or self.is_truncated(agent_id)

    def is_terminated(self, agent_id: AgentID) -> bool:
        """Returns the last terminated flag recorded for `agent_id` (default False)."""
        return self._last_terminateds.get(agent_id, False)

    def is_truncated(self, agent_id: AgentID) -> bool:
        """Returns the last truncated flag recorded for `agent_id` (default False)."""
        return self._last_truncateds.get(agent_id, False)

    def set_last_info(self, agent_id: AgentID, info: Dict):
        """Stores `info` as the most recent info dict for `agent_id`."""
        self._last_infos[agent_id] = info

    def last_info_for(
        self, agent_id: AgentID = _DUMMY_AGENT_ID
    ) -> Optional[EnvInfoDict]:
        """Returns the most recent info dict for `agent_id`, or None."""
        return self._last_infos.get(agent_id)

    @property
    def length(self):
        """Episode length in total env steps (0 right after the initial obs)."""
        return self.total_env_steps
.venv/lib/python3.11/site-packages/ray/rllib/evaluation/metrics.py ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import collections
2
+ import logging
3
+ import numpy as np
4
+ from typing import List, Optional, TYPE_CHECKING
5
+
6
+ from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID
7
+ from ray.rllib.utils.annotations import OldAPIStack
8
+ from ray.rllib.utils.metrics.learner_info import LEARNER_STATS_KEY
9
+ from ray.rllib.utils.typing import GradInfoDict, LearnerStatsDict, ResultDict
10
+
11
+ if TYPE_CHECKING:
12
+ from ray.rllib.env.env_runner_group import EnvRunnerGroup
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+ RolloutMetrics = OldAPIStack(
17
+ collections.namedtuple(
18
+ "RolloutMetrics",
19
+ [
20
+ "episode_length",
21
+ "episode_reward",
22
+ "agent_rewards",
23
+ "custom_metrics",
24
+ "perf_stats",
25
+ "hist_data",
26
+ "media",
27
+ "episode_faulty",
28
+ "connector_metrics",
29
+ ],
30
+ )
31
+ )
32
+ RolloutMetrics.__new__.__defaults__ = (0, 0, {}, {}, {}, {}, {}, False, {})
33
+
34
+
35
+ @OldAPIStack
36
+ def get_learner_stats(grad_info: GradInfoDict) -> LearnerStatsDict:
37
+ """Return optimization stats reported from the policy.
38
+
39
+ .. testcode::
40
+ :skipif: True
41
+
42
+ grad_info = worker.learn_on_batch(samples)
43
+
44
+ # {"td_error": [...], "learner_stats": {"vf_loss": ..., ...}}
45
+
46
+ print(get_stats(grad_info))
47
+
48
+ .. testoutput::
49
+
50
+ {"vf_loss": ..., "policy_loss": ...}
51
+ """
52
+ if LEARNER_STATS_KEY in grad_info:
53
+ return grad_info[LEARNER_STATS_KEY]
54
+
55
+ multiagent_stats = {}
56
+ for k, v in grad_info.items():
57
+ if type(v) is dict:
58
+ if LEARNER_STATS_KEY in v:
59
+ multiagent_stats[k] = v[LEARNER_STATS_KEY]
60
+
61
+ return multiagent_stats
62
+
63
+
64
+ @OldAPIStack
65
+ def collect_metrics(
66
+ workers: "EnvRunnerGroup",
67
+ remote_worker_ids: Optional[List[int]] = None,
68
+ timeout_seconds: int = 180,
69
+ keep_custom_metrics: bool = False,
70
+ ) -> ResultDict:
71
+ """Gathers episode metrics from rollout worker set.
72
+
73
+ Args:
74
+ workers: EnvRunnerGroup.
75
+ remote_worker_ids: Optional list of IDs of remote workers to collect
76
+ metrics from.
77
+ timeout_seconds: Timeout in seconds for collecting metrics from remote workers.
78
+ keep_custom_metrics: Whether to keep custom metrics in the result dict as
79
+ they are (True) or to aggregate them (False).
80
+
81
+ Returns:
82
+ A result dict of metrics.
83
+ """
84
+ episodes = collect_episodes(
85
+ workers, remote_worker_ids, timeout_seconds=timeout_seconds
86
+ )
87
+ metrics = summarize_episodes(
88
+ episodes, episodes, keep_custom_metrics=keep_custom_metrics
89
+ )
90
+ return metrics
91
+
92
+
93
+ @OldAPIStack
94
+ def collect_episodes(
95
+ workers: "EnvRunnerGroup",
96
+ remote_worker_ids: Optional[List[int]] = None,
97
+ timeout_seconds: int = 180,
98
+ ) -> List[RolloutMetrics]:
99
+ """Gathers new episodes metrics tuples from the given RolloutWorkers.
100
+
101
+ Args:
102
+ workers: EnvRunnerGroup.
103
+ remote_worker_ids: Optional list of IDs of remote workers to collect
104
+ metrics from.
105
+ timeout_seconds: Timeout in seconds for collecting metrics from remote workers.
106
+
107
+ Returns:
108
+ List of RolloutMetrics.
109
+ """
110
+ # This will drop get_metrics() calls that are too slow.
111
+ # We can potentially make this an asynchronous call if this turns
112
+ # out to be a problem.
113
+ metric_lists = workers.foreach_env_runner(
114
+ lambda w: w.get_metrics(),
115
+ local_env_runner=True,
116
+ remote_worker_ids=remote_worker_ids,
117
+ timeout_seconds=timeout_seconds,
118
+ )
119
+ if len(metric_lists) == 0:
120
+ logger.warning("WARNING: collected no metrics.")
121
+
122
+ episodes = []
123
+ for metrics in metric_lists:
124
+ episodes.extend(metrics)
125
+
126
+ return episodes
127
+
128
+
129
+ @OldAPIStack
130
+ def summarize_episodes(
131
+ episodes: List[RolloutMetrics],
132
+ new_episodes: List[RolloutMetrics] = None,
133
+ keep_custom_metrics: bool = False,
134
+ ) -> ResultDict:
135
+ """Summarizes a set of episode metrics tuples.
136
+
137
+ Args:
138
+ episodes: List of most recent n episodes. This may include historical ones
139
+ (not newly collected in this iteration) in order to achieve the size of
140
+ the smoothing window.
141
+ new_episodes: All the episodes that were completed in this iteration.
142
+ keep_custom_metrics: Whether to keep custom metrics in the result dict as
143
+ they are (True) or to aggregate them (False).
144
+
145
+ Returns:
146
+ A result dict of metrics.
147
+ """
148
+
149
+ if new_episodes is None:
150
+ new_episodes = episodes
151
+
152
+ episode_rewards = []
153
+ episode_lengths = []
154
+ policy_rewards = collections.defaultdict(list)
155
+ custom_metrics = collections.defaultdict(list)
156
+ perf_stats = collections.defaultdict(list)
157
+ hist_stats = collections.defaultdict(list)
158
+ episode_media = collections.defaultdict(list)
159
+ connector_metrics = collections.defaultdict(list)
160
+ num_faulty_episodes = 0
161
+
162
+ for episode in episodes:
163
+ # Faulty episodes may still carry perf_stats data.
164
+ for k, v in episode.perf_stats.items():
165
+ perf_stats[k].append(v)
166
+ # Continue if this is a faulty episode.
167
+ # There should be other meaningful stats to be collected.
168
+ if episode.episode_faulty:
169
+ num_faulty_episodes += 1
170
+ continue
171
+
172
+ episode_lengths.append(episode.episode_length)
173
+ episode_rewards.append(episode.episode_reward)
174
+ for k, v in episode.custom_metrics.items():
175
+ custom_metrics[k].append(v)
176
+ is_multi_agent = (
177
+ len(episode.agent_rewards) > 1
178
+ or DEFAULT_POLICY_ID not in episode.agent_rewards
179
+ )
180
+ if is_multi_agent:
181
+ for (_, policy_id), reward in episode.agent_rewards.items():
182
+ policy_rewards[policy_id].append(reward)
183
+ for k, v in episode.hist_data.items():
184
+ hist_stats[k] += v
185
+ for k, v in episode.media.items():
186
+ episode_media[k].append(v)
187
+ if hasattr(episode, "connector_metrics"):
188
+ # Group connector metrics by connector_metric name for all policies
189
+ for per_pipeline_metrics in episode.connector_metrics.values():
190
+ for per_connector_metrics in per_pipeline_metrics.values():
191
+ for connector_metric_name, val in per_connector_metrics.items():
192
+ connector_metrics[connector_metric_name].append(val)
193
+
194
+ if episode_rewards:
195
+ min_reward = min(episode_rewards)
196
+ max_reward = max(episode_rewards)
197
+ avg_reward = np.mean(episode_rewards)
198
+ else:
199
+ min_reward = float("nan")
200
+ max_reward = float("nan")
201
+ avg_reward = float("nan")
202
+ if episode_lengths:
203
+ avg_length = np.mean(episode_lengths)
204
+ else:
205
+ avg_length = float("nan")
206
+
207
+ # Show as histogram distributions.
208
+ hist_stats["episode_reward"] = episode_rewards
209
+ hist_stats["episode_lengths"] = episode_lengths
210
+
211
+ policy_reward_min = {}
212
+ policy_reward_mean = {}
213
+ policy_reward_max = {}
214
+ for policy_id, rewards in policy_rewards.copy().items():
215
+ policy_reward_min[policy_id] = np.min(rewards)
216
+ policy_reward_mean[policy_id] = np.mean(rewards)
217
+ policy_reward_max[policy_id] = np.max(rewards)
218
+
219
+ # Show as histogram distributions.
220
+ hist_stats["policy_{}_reward".format(policy_id)] = rewards
221
+
222
+ for k, v_list in custom_metrics.copy().items():
223
+ filt = [v for v in v_list if not np.any(np.isnan(v))]
224
+ if keep_custom_metrics:
225
+ custom_metrics[k] = filt
226
+ else:
227
+ custom_metrics[k + "_mean"] = np.mean(filt)
228
+ if filt:
229
+ custom_metrics[k + "_min"] = np.min(filt)
230
+ custom_metrics[k + "_max"] = np.max(filt)
231
+ else:
232
+ custom_metrics[k + "_min"] = float("nan")
233
+ custom_metrics[k + "_max"] = float("nan")
234
+ del custom_metrics[k]
235
+
236
+ for k, v_list in perf_stats.copy().items():
237
+ perf_stats[k] = np.mean(v_list)
238
+
239
+ mean_connector_metrics = dict()
240
+ for k, v_list in connector_metrics.items():
241
+ mean_connector_metrics[k] = np.mean(v_list)
242
+
243
+ return dict(
244
+ episode_reward_max=max_reward,
245
+ episode_reward_min=min_reward,
246
+ episode_reward_mean=avg_reward,
247
+ episode_len_mean=avg_length,
248
+ episode_media=dict(episode_media),
249
+ episodes_timesteps_total=sum(episode_lengths),
250
+ policy_reward_min=policy_reward_min,
251
+ policy_reward_max=policy_reward_max,
252
+ policy_reward_mean=policy_reward_mean,
253
+ custom_metrics=dict(custom_metrics),
254
+ hist_stats=dict(hist_stats),
255
+ sampler_perf=dict(perf_stats),
256
+ num_faulty_episodes=num_faulty_episodes,
257
+ connector_metrics=mean_connector_metrics,
258
+ # Added these (duplicate) values here for forward compatibility with the new API
259
+ # stack's metrics structure. This allows us to unify our test cases and keeping
260
+ # the new API stack clean of backward-compatible keys.
261
+ num_episodes=len(new_episodes),
262
+ episode_return_max=max_reward,
263
+ episode_return_min=min_reward,
264
+ episode_return_mean=avg_reward,
265
+ episodes_this_iter=len(new_episodes), # deprecate in favor of `num_epsodes_...`
266
+ )
.venv/lib/python3.11/site-packages/ray/rllib/evaluation/observation_function.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict
2
+
3
+ from ray.rllib.env import BaseEnv
4
+ from ray.rllib.policy import Policy
5
+ from ray.rllib.evaluation import RolloutWorker
6
+ from ray.rllib.utils.annotations import OldAPIStack
7
+ from ray.rllib.utils.framework import TensorType
8
+ from ray.rllib.utils.typing import AgentID, PolicyID
9
+
10
+
11
+ @OldAPIStack
12
+ class ObservationFunction:
13
+ """Interceptor function for rewriting observations from the environment.
14
+
15
+ These callbacks can be used for preprocessing of observations, especially
16
+ in multi-agent scenarios.
17
+
18
+ Observation functions can be specified in the multi-agent config by
19
+ specifying ``{"observation_fn": your_obs_func}``. Note that
20
+ ``your_obs_func`` can be a plain Python function.
21
+
22
+ This API is **experimental**.
23
+ """
24
+
25
+ def __call__(
26
+ self,
27
+ agent_obs: Dict[AgentID, TensorType],
28
+ worker: RolloutWorker,
29
+ base_env: BaseEnv,
30
+ policies: Dict[PolicyID, Policy],
31
+ episode,
32
+ **kw
33
+ ) -> Dict[AgentID, TensorType]:
34
+ """Callback run on each environment step to observe the environment.
35
+
36
+ This method takes in the original agent observation dict returned by
37
+ a MultiAgentEnv, and returns a possibly modified one. It can be
38
+ thought of as a "wrapper" around the environment.
39
+
40
+ TODO(ekl): allow end-to-end differentiation through the observation
41
+ function and policy losses.
42
+
43
+ TODO(ekl): enable batch processing.
44
+
45
+ Args:
46
+ agent_obs: Dictionary of default observations from the
47
+ environment. The default implementation of observe() simply
48
+ returns this dict.
49
+ worker: Reference to the current rollout worker.
50
+ base_env: BaseEnv running the episode. The underlying
51
+ sub environment objects (BaseEnvs are vectorized) can be
52
+ retrieved by calling `base_env.get_sub_environments()`.
53
+ policies: Mapping of policy id to policy objects. In single
54
+ agent mode there will only be a single "default" policy.
55
+ episode: Episode state object.
56
+ kwargs: Forward compatibility placeholder.
57
+
58
+ Returns:
59
+ new_agent_obs: copy of agent obs with updates. You can
60
+ rewrite or drop data from the dict if needed (e.g., the env
61
+ can have a dummy "global" observation, and the observer can
62
+ merge the global state into individual observations.
63
+
64
+ .. testcode::
65
+ :skipif: True
66
+
67
+ # Observer that merges global state into individual obs. It is
68
+ # rewriting the discrete obs into a tuple with global state.
69
+ example_obs_fn1({"a": 1, "b": 2, "global_state": 101}, ...)
70
+
71
+ .. testoutput::
72
+
73
+ {"a": [1, 101], "b": [2, 101]}
74
+
75
+ .. testcode::
76
+ :skipif: True
77
+
78
+ # Observer for e.g., custom centralized critic model. It is
79
+ # rewriting the discrete obs into a dict with more data.
80
+ example_obs_fn2({"a": 1, "b": 2}, ...)
81
+
82
+ .. testoutput::
83
+
84
+ {"a": {"self": 1, "other": 2}, "b": {"self": 2, "other": 1}}
85
+ """
86
+
87
+ return agent_obs
.venv/lib/python3.11/site-packages/ray/rllib/evaluation/postprocessing.py ADDED
@@ -0,0 +1,328 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import scipy.signal
3
+ from typing import Dict, Optional
4
+
5
+ from ray.rllib.policy.policy import Policy
6
+ from ray.rllib.policy.sample_batch import SampleBatch
7
+ from ray.rllib.utils.annotations import DeveloperAPI, OldAPIStack
8
+ from ray.rllib.utils.numpy import convert_to_numpy
9
+ from ray.rllib.utils.typing import AgentID
10
+ from ray.rllib.utils.typing import TensorType
11
+
12
+
13
+ @DeveloperAPI
14
+ class Postprocessing:
15
+ """Constant definitions for postprocessing."""
16
+
17
+ ADVANTAGES = "advantages"
18
+ VALUE_TARGETS = "value_targets"
19
+
20
+
21
+ @OldAPIStack
22
+ def adjust_nstep(n_step: int, gamma: float, batch: SampleBatch) -> None:
23
+ """Rewrites `batch` to encode n-step rewards, terminateds, truncateds, and next-obs.
24
+
25
+ Observations and actions remain unaffected. At the end of the trajectory,
26
+ n is truncated to fit in the traj length.
27
+
28
+ Args:
29
+ n_step: The number of steps to look ahead and adjust.
30
+ gamma: The discount factor.
31
+ batch: The SampleBatch to adjust (in place).
32
+
33
+ Examples:
34
+ n-step=3
35
+ Trajectory=o0 r0 d0, o1 r1 d1, o2 r2 d2, o3 r3 d3, o4 r4 d4=True o5
36
+ gamma=0.9
37
+ Returned trajectory:
38
+ 0: o0 [r0 + 0.9*r1 + 0.9^2*r2 + 0.9^3*r3] d3 o0'=o3
39
+ 1: o1 [r1 + 0.9*r2 + 0.9^2*r3 + 0.9^3*r4] d4 o1'=o4
40
+ 2: o2 [r2 + 0.9*r3 + 0.9^2*r4] d4 o1'=o5
41
+ 3: o3 [r3 + 0.9*r4] d4 o3'=o5
42
+ 4: o4 r4 d4 o4'=o5
43
+ """
44
+
45
+ assert (
46
+ batch.is_single_trajectory()
47
+ ), "Unexpected terminated|truncated in middle of trajectory!"
48
+
49
+ len_ = len(batch)
50
+
51
+ # Shift NEXT_OBS, TERMINATEDS, and TRUNCATEDS.
52
+ batch[SampleBatch.NEXT_OBS] = np.concatenate(
53
+ [
54
+ batch[SampleBatch.OBS][n_step:],
55
+ np.stack([batch[SampleBatch.NEXT_OBS][-1]] * min(n_step, len_)),
56
+ ],
57
+ axis=0,
58
+ )
59
+ batch[SampleBatch.TERMINATEDS] = np.concatenate(
60
+ [
61
+ batch[SampleBatch.TERMINATEDS][n_step - 1 :],
62
+ np.tile(batch[SampleBatch.TERMINATEDS][-1], min(n_step - 1, len_)),
63
+ ],
64
+ axis=0,
65
+ )
66
+ # Only fix `truncateds`, if present in the batch.
67
+ if SampleBatch.TRUNCATEDS in batch:
68
+ batch[SampleBatch.TRUNCATEDS] = np.concatenate(
69
+ [
70
+ batch[SampleBatch.TRUNCATEDS][n_step - 1 :],
71
+ np.tile(batch[SampleBatch.TRUNCATEDS][-1], min(n_step - 1, len_)),
72
+ ],
73
+ axis=0,
74
+ )
75
+
76
+ # Change rewards in place.
77
+ for i in range(len_):
78
+ for j in range(1, n_step):
79
+ if i + j < len_:
80
+ batch[SampleBatch.REWARDS][i] += (
81
+ gamma**j * batch[SampleBatch.REWARDS][i + j]
82
+ )
83
+
84
+
85
+ @OldAPIStack
86
+ def compute_advantages(
87
+ rollout: SampleBatch,
88
+ last_r: float,
89
+ gamma: float = 0.9,
90
+ lambda_: float = 1.0,
91
+ use_gae: bool = True,
92
+ use_critic: bool = True,
93
+ rewards: TensorType = None,
94
+ vf_preds: TensorType = None,
95
+ ):
96
+ """Given a rollout, compute its value targets and the advantages.
97
+
98
+ Args:
99
+ rollout: SampleBatch of a single trajectory.
100
+ last_r: Value estimation for last observation.
101
+ gamma: Discount factor.
102
+ lambda_: Parameter for GAE.
103
+ use_gae: Using Generalized Advantage Estimation.
104
+ use_critic: Whether to use critic (value estimates). Setting
105
+ this to False will use 0 as baseline.
106
+ rewards: Override the reward values in rollout.
107
+ vf_preds: Override the value function predictions in rollout.
108
+
109
+ Returns:
110
+ SampleBatch with experience from rollout and processed rewards.
111
+ """
112
+ assert (
113
+ SampleBatch.VF_PREDS in rollout or not use_critic
114
+ ), "use_critic=True but values not found"
115
+ assert use_critic or not use_gae, "Can't use gae without using a value function"
116
+ last_r = convert_to_numpy(last_r)
117
+
118
+ if rewards is None:
119
+ rewards = rollout[SampleBatch.REWARDS]
120
+ if vf_preds is None and use_critic:
121
+ vf_preds = rollout[SampleBatch.VF_PREDS]
122
+
123
+ if use_gae:
124
+ vpred_t = np.concatenate([vf_preds, np.array([last_r])])
125
+ delta_t = rewards + gamma * vpred_t[1:] - vpred_t[:-1]
126
+ # This formula for the advantage comes from:
127
+ # "Generalized Advantage Estimation": https://arxiv.org/abs/1506.02438
128
+ rollout[Postprocessing.ADVANTAGES] = discount_cumsum(delta_t, gamma * lambda_)
129
+ rollout[Postprocessing.VALUE_TARGETS] = (
130
+ rollout[Postprocessing.ADVANTAGES] + vf_preds
131
+ ).astype(np.float32)
132
+ else:
133
+ rewards_plus_v = np.concatenate([rewards, np.array([last_r])])
134
+ discounted_returns = discount_cumsum(rewards_plus_v, gamma)[:-1].astype(
135
+ np.float32
136
+ )
137
+
138
+ if use_critic:
139
+ rollout[Postprocessing.ADVANTAGES] = discounted_returns - vf_preds
140
+ rollout[Postprocessing.VALUE_TARGETS] = discounted_returns
141
+ else:
142
+ rollout[Postprocessing.ADVANTAGES] = discounted_returns
143
+ rollout[Postprocessing.VALUE_TARGETS] = np.zeros_like(
144
+ rollout[Postprocessing.ADVANTAGES]
145
+ )
146
+
147
+ rollout[Postprocessing.ADVANTAGES] = rollout[Postprocessing.ADVANTAGES].astype(
148
+ np.float32
149
+ )
150
+
151
+ return rollout
152
+
153
+
154
+ @OldAPIStack
155
+ def compute_gae_for_sample_batch(
156
+ policy: Policy,
157
+ sample_batch: SampleBatch,
158
+ other_agent_batches: Optional[Dict[AgentID, SampleBatch]] = None,
159
+ episode=None,
160
+ ) -> SampleBatch:
161
+ """Adds GAE (generalized advantage estimations) to a trajectory.
162
+
163
+ The trajectory contains only data from one episode and from one agent.
164
+ - If `config.batch_mode=truncate_episodes` (default), sample_batch may
165
+ contain a truncated (at-the-end) episode, in case the
166
+ `config.rollout_fragment_length` was reached by the sampler.
167
+ - If `config.batch_mode=complete_episodes`, sample_batch will contain
168
+ exactly one episode (no matter how long).
169
+ New columns can be added to sample_batch and existing ones may be altered.
170
+
171
+ Args:
172
+ policy: The Policy used to generate the trajectory (`sample_batch`)
173
+ sample_batch: The SampleBatch to postprocess.
174
+ other_agent_batches: Optional dict of AgentIDs mapping to other
175
+ agents' trajectory data (from the same episode).
176
+ NOTE: The other agents use the same policy.
177
+ episode: Optional multi-agent episode object in which the agents
178
+ operated.
179
+
180
+ Returns:
181
+ The postprocessed, modified SampleBatch (or a new one).
182
+ """
183
+ # Compute the SampleBatch.VALUES_BOOTSTRAPPED column, which we'll need for the
184
+ # following `last_r` arg in `compute_advantages()`.
185
+ sample_batch = compute_bootstrap_value(sample_batch, policy)
186
+
187
+ vf_preds = np.array(sample_batch[SampleBatch.VF_PREDS])
188
+ rewards = np.array(sample_batch[SampleBatch.REWARDS])
189
+ # We need to squeeze out the time dimension if there is one
190
+ # Sanity check that both have the same shape
191
+ if len(vf_preds.shape) == 2:
192
+ assert vf_preds.shape == rewards.shape
193
+ vf_preds = np.squeeze(vf_preds, axis=1)
194
+ rewards = np.squeeze(rewards, axis=1)
195
+ squeezed = True
196
+ else:
197
+ squeezed = False
198
+
199
+ # Adds the policy logits, VF preds, and advantages to the batch,
200
+ # using GAE ("generalized advantage estimation") or not.
201
+ batch = compute_advantages(
202
+ rollout=sample_batch,
203
+ last_r=sample_batch[SampleBatch.VALUES_BOOTSTRAPPED][-1],
204
+ gamma=policy.config["gamma"],
205
+ lambda_=policy.config["lambda"],
206
+ use_gae=policy.config["use_gae"],
207
+ use_critic=policy.config.get("use_critic", True),
208
+ vf_preds=vf_preds,
209
+ rewards=rewards,
210
+ )
211
+
212
+ if squeezed:
213
+ # If we needed to squeeze rewards and vf_preds, we need to unsqueeze
214
+ # advantages again for it to have the same shape
215
+ batch[Postprocessing.ADVANTAGES] = np.expand_dims(
216
+ batch[Postprocessing.ADVANTAGES], axis=1
217
+ )
218
+
219
+ return batch
220
+
221
+
222
+ @OldAPIStack
223
+ def compute_bootstrap_value(sample_batch: SampleBatch, policy: Policy) -> SampleBatch:
224
+ """Performs a value function computation at the end of a trajectory.
225
+
226
+ If the trajectory is terminated (not truncated), will not use the value function,
227
+ but assume that the value of the last timestep is 0.0.
228
+ In all other cases, will use the given policy's value function to compute the
229
+ "bootstrapped" value estimate at the end of the given trajectory. To do so, the
230
+ very last observation (sample_batch[NEXT_OBS][-1]) and - if applicable -
231
+ the very last state output (sample_batch[STATE_OUT][-1]) wil be used as inputs to
232
+ the value function.
233
+
234
+ The thus computed value estimate will be stored in a new column of the
235
+ `sample_batch`: SampleBatch.VALUES_BOOTSTRAPPED. Thereby, values at all timesteps
236
+ in this column are set to 0.0, except or the last timestep, which receives the
237
+ computed bootstrapped value.
238
+ This is done, such that in any loss function (which processes raw, intact
239
+ trajectories, such as those of IMPALA and APPO) can use this new column as follows:
240
+
241
+ Example: numbers=ts in episode, '|'=episode boundary (terminal),
242
+ X=bootstrapped value (!= 0.0 b/c ts=12 is not a terminal).
243
+ ts=5 is NOT a terminal.
244
+ T: 8 9 10 11 12 <- no terminal
245
+ VF_PREDS: . . . . .
246
+ VALUES_BOOTSTRAPPED: 0 0 0 0 X
247
+
248
+ Args:
249
+ sample_batch: The SampleBatch (single trajectory) for which to compute the
250
+ bootstrap value at the end. This SampleBatch will be altered in place
251
+ (by adding a new column: SampleBatch.VALUES_BOOTSTRAPPED).
252
+ policy: The Policy object, whose value function to use.
253
+
254
+ Returns:
255
+ The altered SampleBatch (with the extra SampleBatch.VALUES_BOOTSTRAPPED
256
+ column).
257
+ """
258
+ # Trajectory is actually complete -> last r=0.0.
259
+ if sample_batch[SampleBatch.TERMINATEDS][-1]:
260
+ last_r = 0.0
261
+ # Trajectory has been truncated -> last r=VF estimate of last obs.
262
+ else:
263
+ # Input dict is provided to us automatically via the Model's
264
+ # requirements. It's a single-timestep (last one in trajectory)
265
+ # input_dict.
266
+ # Create an input dict according to the Policy's requirements.
267
+ input_dict = sample_batch.get_single_step_input_dict(
268
+ policy.view_requirements, index="last"
269
+ )
270
+ last_r = policy._value(**input_dict)
271
+
272
+ vf_preds = np.array(sample_batch[SampleBatch.VF_PREDS])
273
+ # We need to squeeze out the time dimension if there is one
274
+ if len(vf_preds.shape) == 2:
275
+ vf_preds = np.squeeze(vf_preds, axis=1)
276
+ squeezed = True
277
+ else:
278
+ squeezed = False
279
+
280
+ # Set the SampleBatch.VALUES_BOOTSTRAPPED field to VF_PREDS[1:] + the
281
+ # very last timestep (where this bootstrapping value is actually needed), which
282
+ # we set to the computed `last_r`.
283
+ sample_batch[SampleBatch.VALUES_BOOTSTRAPPED] = np.concatenate(
284
+ [
285
+ convert_to_numpy(vf_preds[1:]),
286
+ np.array([convert_to_numpy(last_r)], dtype=np.float32),
287
+ ],
288
+ axis=0,
289
+ )
290
+
291
+ if squeezed:
292
+ sample_batch[SampleBatch.VF_PREDS] = np.expand_dims(vf_preds, axis=1)
293
+ sample_batch[SampleBatch.VALUES_BOOTSTRAPPED] = np.expand_dims(
294
+ sample_batch[SampleBatch.VALUES_BOOTSTRAPPED], axis=1
295
+ )
296
+
297
+ return sample_batch
298
+
299
+
300
+ @OldAPIStack
301
+ def discount_cumsum(x: np.ndarray, gamma: float) -> np.ndarray:
302
+ """Calculates the discounted cumulative sum over a reward sequence `x`.
303
+
304
+ y[t] - discount*y[t+1] = x[t]
305
+ reversed(y)[t] - discount*reversed(y)[t-1] = reversed(x)[t]
306
+
307
+ Args:
308
+ gamma: The discount factor gamma.
309
+
310
+ Returns:
311
+ The sequence containing the discounted cumulative sums
312
+ for each individual reward in `x` till the end of the trajectory.
313
+
314
+ .. testcode::
315
+ :skipif: True
316
+
317
+ x = np.array([0.0, 1.0, 2.0, 3.0])
318
+ gamma = 0.9
319
+ discount_cumsum(x, gamma)
320
+
321
+ .. testoutput::
322
+
323
+ array([0.0 + 0.9*1.0 + 0.9^2*2.0 + 0.9^3*3.0,
324
+ 1.0 + 0.9*2.0 + 0.9^2*3.0,
325
+ 2.0 + 0.9*3.0,
326
+ 3.0])
327
+ """
328
+ return scipy.signal.lfilter([1], [1, float(-gamma)], x[::-1], axis=0)[::-1]
.venv/lib/python3.11/site-packages/ray/rllib/evaluation/rollout_worker.py ADDED
@@ -0,0 +1,2004 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import importlib.util
3
+ import logging
4
+ import os
5
+ import platform
6
+ import threading
7
+ from collections import defaultdict
8
+ from types import FunctionType
9
+ from typing import (
10
+ TYPE_CHECKING,
11
+ Any,
12
+ Callable,
13
+ Collection,
14
+ Dict,
15
+ List,
16
+ Optional,
17
+ Set,
18
+ Tuple,
19
+ Type,
20
+ Union,
21
+ )
22
+
23
+ from gymnasium.spaces import Space
24
+
25
+ import ray
26
+ from ray import ObjectRef
27
+ from ray import cloudpickle as pickle
28
+ from ray.rllib.connectors.util import (
29
+ create_connectors_for_policy,
30
+ maybe_get_filters_for_syncing,
31
+ )
32
+ from ray.rllib.core.rl_module import validate_module_id
33
+ from ray.rllib.core.rl_module.rl_module import RLModuleSpec
34
+ from ray.rllib.env.base_env import BaseEnv, convert_to_base_env
35
+ from ray.rllib.env.env_context import EnvContext
36
+ from ray.rllib.env.env_runner import EnvRunner
37
+ from ray.rllib.env.external_multi_agent_env import ExternalMultiAgentEnv
38
+ from ray.rllib.env.multi_agent_env import MultiAgentEnv
39
+ from ray.rllib.env.wrappers.atari_wrappers import is_atari, wrap_deepmind
40
+ from ray.rllib.evaluation.metrics import RolloutMetrics
41
+ from ray.rllib.evaluation.sampler import SyncSampler
42
+ from ray.rllib.models import ModelCatalog
43
+ from ray.rllib.models.preprocessors import Preprocessor
44
+ from ray.rllib.offline import (
45
+ D4RLReader,
46
+ DatasetReader,
47
+ DatasetWriter,
48
+ InputReader,
49
+ IOContext,
50
+ JsonReader,
51
+ JsonWriter,
52
+ MixedInput,
53
+ NoopOutput,
54
+ OutputWriter,
55
+ ShuffledInput,
56
+ )
57
+ from ray.rllib.policy.policy import Policy, PolicySpec
58
+ from ray.rllib.policy.policy_map import PolicyMap
59
+ from ray.rllib.policy.sample_batch import (
60
+ DEFAULT_POLICY_ID,
61
+ MultiAgentBatch,
62
+ concat_samples,
63
+ convert_ma_batch_to_sample_batch,
64
+ )
65
+ from ray.rllib.policy.torch_policy import TorchPolicy
66
+ from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2
67
+ from ray.rllib.utils import force_list
68
+ from ray.rllib.utils.annotations import OldAPIStack, override
69
+ from ray.rllib.utils.debug import summarize, update_global_seed_if_necessary
70
+ from ray.rllib.utils.error import ERR_MSG_NO_GPUS, HOWTO_CHANGE_CONFIG
71
+ from ray.rllib.utils.filter import Filter, NoFilter
72
+ from ray.rllib.utils.framework import try_import_tf, try_import_torch
73
+ from ray.rllib.utils.from_config import from_config
74
+ from ray.rllib.utils.policy import create_policy_for_framework
75
+ from ray.rllib.utils.sgd import do_minibatch_sgd
76
+ from ray.rllib.utils.tf_run_builder import _TFRunBuilder
77
+ from ray.rllib.utils.tf_utils import get_gpu_devices as get_tf_gpu_devices
78
+ from ray.rllib.utils.tf_utils import get_tf_eager_cls_if_necessary
79
+ from ray.rllib.utils.typing import (
80
+ AgentID,
81
+ EnvCreator,
82
+ EnvType,
83
+ ModelGradients,
84
+ ModelWeights,
85
+ MultiAgentPolicyConfigDict,
86
+ PartialAlgorithmConfigDict,
87
+ PolicyID,
88
+ PolicyState,
89
+ SampleBatchType,
90
+ T,
91
+ )
92
+ from ray.tune.registry import registry_contains_input, registry_get_input
93
+ from ray.util.annotations import PublicAPI
94
+ from ray.util.debug import disable_log_once_globally, enable_periodic_logging, log_once
95
+ from ray.util.iter import ParallelIteratorWorker
96
+
97
+ if TYPE_CHECKING:
98
+ from ray.rllib.algorithms.algorithm_config import AlgorithmConfig
99
+ from ray.rllib.callbacks.callbacks import RLlibCallback
100
+
101
+ tf1, tf, tfv = try_import_tf()
102
+ torch, _ = try_import_torch()
103
+
104
+ logger = logging.getLogger(__name__)
105
+
106
+ # Handle to the current rollout worker, which will be set to the most recently
107
+ # created RolloutWorker in this process. This can be helpful to access in
108
+ # custom env or policy classes for debugging or advanced use cases.
109
+ _global_worker: Optional["RolloutWorker"] = None
110
+
111
+
112
@OldAPIStack
def get_global_worker() -> "RolloutWorker":
    """Returns a handle to the active rollout worker in this process.

    The handle points at the most recently constructed RolloutWorker in this
    process (set by `RolloutWorker.__init__`), or None if none was created yet.
    """
    # Plain read of the module-level handle; a `global` declaration is only
    # required for writes, so none is needed here.
    return _global_worker
118
+
119
+
120
def _update_env_seed_if_necessary(
    env: EnvType, seed: int, worker_idx: int, vector_idx: int
):
    """Set a deterministic random seed on environment.

    The effective seed combines the base ``seed`` with the worker and vector
    indices so every sub-environment of every worker is seeded distinctly and
    reproducibly.

    NOTE: this may not work with remote environments (issue #18154).
    """
    # No seeding requested at all.
    if seed is None:
        return

    # A single RL job is unlikely to have more than 10K
    # rollout workers.
    seeds_per_worker: int = 1000
    assert (
        worker_idx < seeds_per_worker
    ), "Too many envs per worker. Random seeds may collide."
    target_seed: int = worker_idx * seeds_per_worker + vector_idx + seed

    # Gymnasium.env.
    # This will silently fail for most Farama-foundation gymnasium environments.
    # (they do nothing and return None per default)
    if not hasattr(env, "reset"):
        if log_once("env_has_no_reset_method"):
            logger.info(f"Env {env} doesn't have a `reset()` method. Cannot seed.")
        return

    try:
        env.reset(seed=target_seed)
    except Exception:
        logger.info(
            f"Env {env} doesn't support setting a seed via its `reset()` "
            "method! Implement this method as `reset(self, *, seed=None, "
            "options=None)` for it to abide to the correct API. Cannot seed."
        )
153
+
154
+
155
+ @OldAPIStack
156
+ class RolloutWorker(ParallelIteratorWorker, EnvRunner):
157
+ """Common experience collection class.
158
+
159
+ This class wraps a policy instance and an environment class to
160
+ collect experiences from the environment. You can create many replicas of
161
+ this class as Ray actors to scale RL training.
162
+
163
+ This class supports vectorized and multi-agent policy evaluation (e.g.,
164
+ VectorEnv, MultiAgentEnv, etc.)
165
+
166
+ .. testcode::
167
+ :skipif: True
168
+
169
+ # Create a rollout worker and using it to collect experiences.
170
+ import gymnasium as gym
171
+ from ray.rllib.evaluation.rollout_worker import RolloutWorker
172
+ from ray.rllib.algorithms.ppo.ppo_tf_policy import PPOTF1Policy
173
+ worker = RolloutWorker(
174
+ env_creator=lambda _: gym.make("CartPole-v1"),
175
+ default_policy_class=PPOTF1Policy)
176
+ print(worker.sample())
177
+
178
+ # Creating a multi-agent rollout worker
179
+ from gymnasium.spaces import Discrete, Box
180
+ import random
181
+ MultiAgentTrafficGrid = ...
182
+ worker = RolloutWorker(
183
+ env_creator=lambda _: MultiAgentTrafficGrid(num_cars=25),
184
+ config=AlgorithmConfig().multi_agent(
185
+ policies={
186
+ # Use an ensemble of two policies for car agents
187
+ "car_policy1":
188
+ (PGTFPolicy, Box(...), Discrete(...),
189
+ AlgorithmConfig.overrides(gamma=0.99)),
190
+ "car_policy2":
191
+ (PGTFPolicy, Box(...), Discrete(...),
192
+ AlgorithmConfig.overrides(gamma=0.95)),
193
+ # Use a single shared policy for all traffic lights
194
+ "traffic_light_policy":
195
+ (PGTFPolicy, Box(...), Discrete(...), {}),
196
+ },
197
+ policy_mapping_fn=(
198
+ lambda agent_id, episode, **kwargs:
199
+ random.choice(["car_policy1", "car_policy2"])
200
+ if agent_id.startswith("car_") else "traffic_light_policy"),
201
+ ),
202
+ )
203
+ print(worker.sample())
204
+
205
+ .. testoutput::
206
+
207
+ SampleBatch({
208
+ "obs": [[...]], "actions": [[...]], "rewards": [[...]],
209
+ "terminateds": [[...]], "truncateds": [[...]], "new_obs": [[...]]}
210
+ )
211
+
212
+ MultiAgentBatch({
213
+ "car_policy1": SampleBatch(...),
214
+ "car_policy2": SampleBatch(...),
215
+ "traffic_light_policy": SampleBatch(...)}
216
+ )
217
+
218
+ """
219
+
220
    def __init__(
        self,
        *,
        env_creator: EnvCreator,
        validate_env: Optional[Callable[[EnvType, EnvContext], None]] = None,
        config: Optional["AlgorithmConfig"] = None,
        worker_index: int = 0,
        num_workers: Optional[int] = None,
        recreated_worker: bool = False,
        log_dir: Optional[str] = None,
        spaces: Optional[Dict[PolicyID, Tuple[Space, Space]]] = None,
        default_policy_class: Optional[Type[Policy]] = None,
        dataset_shards: Optional[List[ray.data.Dataset]] = None,
        **kwargs,
    ):
        """Initializes a RolloutWorker instance.

        Args:
            env_creator: Function that returns a gym.Env given an EnvContext
                wrapped configuration.
            validate_env: Optional callable to validate the generated
                environment (only on worker=0).
            config: The AlgorithmConfig (or plain config dict) to use. If None
                or a dict, a fresh AlgorithmConfig is built (and updated from
                the dict). The config gets frozen here.
            worker_index: For remote workers, this should be set to a
                non-zero and unique value. This index is passed to created envs
                through EnvContext so that envs can be configured per worker.
            num_workers: Total number of rollout workers. Defaults to
                `config.num_env_runners` if not given.
            recreated_worker: Whether this worker is a recreated one. Workers are
                recreated by an Algorithm (via EnvRunnerGroup) in case
                `restart_failed_env_runners=True` and one of the original workers (or
                an already recreated one) has failed. They don't differ from original
                workers other than the value of this flag (`self.recreated_worker`).
            log_dir: Directory where logs can be placed.
            spaces: An optional space dict mapping policy IDs
                to (obs_space, action_space)-tuples. This is used in case no
                Env is created on this RolloutWorker.
            default_policy_class: Optional Policy class to use for policies
                whose spec does not define one.
            dataset_shards: Optional list of distributed dataset shards; this
                worker later picks its shard by `worker_index`.
            **kwargs: Ignored; accepted for forward-compatibility.
        """
        # Remember the ctor args so the worker can later be re-created 1:1.
        self._original_kwargs: dict = locals().copy()
        del self._original_kwargs["self"]

        # Publish this instance as the process-wide "current" worker (see
        # `get_global_worker()`).
        global _global_worker
        _global_worker = self

        from ray.rllib.algorithms.algorithm_config import AlgorithmConfig

        # Default config needed?
        if config is None or isinstance(config, dict):
            config = AlgorithmConfig().update_from_dict(config or {})
        # Freeze config, so no one else can alter it from here on.
        config.freeze()

        # Set extra python env variables before calling super constructor.
        # Driver (index 0) and remote workers (index > 0) get separate sets.
        if config.extra_python_environs_for_driver and worker_index == 0:
            for key, value in config.extra_python_environs_for_driver.items():
                os.environ[key] = str(value)
        elif config.extra_python_environs_for_worker and worker_index > 0:
            for key, value in config.extra_python_environs_for_worker.items():
                os.environ[key] = str(value)

        # Infinite generator of sample batches, used by ParallelIteratorWorker.
        def gen_rollouts():
            while True:
                yield self.sample()

        ParallelIteratorWorker.__init__(self, gen_rollouts, False)
        EnvRunner.__init__(self, config=config)

        self.num_workers = (
            num_workers if num_workers is not None else self.config.num_env_runners
        )
        # In case we are reading from distributed datasets, store the shards here
        # and pick our shard by our worker-index.
        self._ds_shards = dataset_shards
        self.worker_index: int = worker_index

        # Lock to be able to lock this entire worker
        # (via `self.lock()` and `self.unlock()`).
        # This might be crucial to prevent a race condition in case
        # `config.policy_states_are_swappable=True` and you are using an Algorithm
        # with a learner thread. In this case, the thread might update a policy
        # that is being swapped (during the update) by the Algorithm's
        # training_step's `RolloutWorker.get_weights()` call (to sync back the
        # new weights to all remote workers).
        self._lock = threading.Lock()

        if (
            tf1
            and (config.framework_str == "tf2" or config.enable_tf1_exec_eagerly)
            # This eager check is necessary for certain all-framework tests
            # that use tf's eager_mode() context generator.
            and not tf1.executing_eagerly()
        ):
            tf1.enable_eager_execution()

        if self.config.log_level:
            logging.getLogger("ray.rllib").setLevel(self.config.log_level)

        if self.worker_index > 1:
            disable_log_once_globally()  # only need 1 worker to log
        elif self.config.log_level == "DEBUG":
            enable_periodic_logging()

        # Env context passed into `env_creator` so envs can self-configure
        # based on worker/vector index.
        env_context = EnvContext(
            self.config.env_config,
            worker_index=self.worker_index,
            vector_index=0,
            num_workers=self.num_workers,
            remote=self.config.remote_worker_envs,
            recreated_worker=recreated_worker,
        )
        self.env_context = env_context
        self.config: AlgorithmConfig = config
        self.callbacks: RLlibCallback = self.config.callbacks_class()
        self.recreated_worker: bool = recreated_worker

        # Setup current policy_mapping_fn. Start with the one from the config, which
        # might be None in older checkpoints (nowadays AlgorithmConfig has a proper
        # default for this); Need to cover this situation via the backup lambda here.
        self.policy_mapping_fn = (
            lambda agent_id, episode, worker, **kw: DEFAULT_POLICY_ID
        )
        self.set_policy_mapping_fn(self.config.policy_mapping_fn)

        self.env_creator: EnvCreator = env_creator
        # Resolve possible auto-fragment length.
        configured_rollout_fragment_length = self.config.get_rollout_fragment_length(
            worker_index=self.worker_index
        )
        # Per-`sample()` target size across all sub-envs of this worker.
        self.total_rollout_fragment_length: int = (
            configured_rollout_fragment_length * self.config.num_envs_per_env_runner
        )
        self.preprocessing_enabled: bool = not config._disable_preprocessor_api
        self.last_batch: Optional[SampleBatchType] = None
        self.global_vars: dict = {
            # TODO(sven): Make this per-policy!
            "timestep": 0,
            # Counter for performed gradient updates per policy in `self.policy_map`.
            # Allows for compiling metrics on the off-policy'ness of an update given
            # that the number of gradient updates of the sampling policies are known
            # to the learner (and can be compared to the learner version of the same
            # policy).
            "num_grad_updates_per_policy": defaultdict(int),
        }

        # If seed is provided, add worker index to it and 10k iff evaluation worker.
        self.seed = (
            None
            if self.config.seed is None
            else self.config.seed
            + self.worker_index
            + self.config.in_evaluation * 10000
        )

        # Update the global seed for numpy/random/tf-eager/torch if we are not
        # the local worker, otherwise, this was already done in the Algorithm
        # object itself.
        if self.worker_index > 0:
            update_global_seed_if_necessary(self.config.framework_str, self.seed)

        # A single environment provided by the user (via config.env). This may
        # also remain None.
        # 1) Create the env using the user provided env_creator. This may
        #    return a gym.Env (incl. MultiAgentEnv), an already vectorized
        #    VectorEnv, BaseEnv, ExternalEnv, or an ActorHandle (remote env).
        # 2) Wrap - if applicable - with Atari/rendering wrappers.
        # 3) Seed the env, if necessary.
        # 4) Vectorize the existing single env by creating more clones of
        #    this env and wrapping it with the RLlib BaseEnv class.
        self.env = self.make_sub_env_fn = None

        # Create a (single) env for this worker.
        if not (
            self.worker_index == 0
            and self.num_workers > 0
            and not self.config.create_env_on_local_worker
        ):
            # Run the `env_creator` function passing the EnvContext.
            self.env = env_creator(copy.deepcopy(self.env_context))

        clip_rewards = self.config.clip_rewards

        if self.env is not None:
            # Custom validation function given, typically a function attribute of the
            # Algorithm.
            if validate_env is not None:
                validate_env(self.env, self.env_context)

            # We can't auto-wrap a BaseEnv.
            if isinstance(self.env, (BaseEnv, ray.actor.ActorHandle)):

                def wrap(env):
                    return env

            # Atari type env and "deepmind" preprocessor pref.
            elif is_atari(self.env) and self.config.preprocessor_pref == "deepmind":
                # Deepmind wrappers already handle all preprocessing.
                self.preprocessing_enabled = False

                # If clip_rewards not explicitly set to False, switch it
                # on here (clip between -1.0 and 1.0).
                if self.config.clip_rewards is None:
                    clip_rewards = True

                # Framestacking is used.
                use_framestack = self.config.model.get("framestack") is True

                def wrap(env):
                    env = wrap_deepmind(
                        env,
                        dim=self.config.model.get("dim"),
                        framestack=use_framestack,
                        noframeskip=self.config.env_config.get("frameskip", 0) == 1,
                    )
                    return env

            elif self.config.preprocessor_pref is None:
                # Only turn off preprocessing
                self.preprocessing_enabled = False

                def wrap(env):
                    return env

            else:
                # Default: no extra wrapping needed.

                def wrap(env):
                    return env

            # Wrap env through the correct wrapper.
            self.env: EnvType = wrap(self.env)
            # Ideally, we would use the same make_sub_env() function below
            # to create self.env, but wrap(env) and self.env has a cyclic
            # dependency on each other right now, so we would settle on
            # duplicating the random seed setting logic for now.
            _update_env_seed_if_necessary(self.env, self.seed, self.worker_index, 0)
            # Call custom callback function `on_sub_environment_created`.
            self.callbacks.on_sub_environment_created(
                worker=self,
                sub_environment=self.env,
                env_context=self.env_context,
            )

            # Factory for further sub-env clones (used for vectorization below).
            self.make_sub_env_fn = self._get_make_sub_env_fn(
                env_creator, env_context, validate_env, wrap, self.seed
            )

        self.spaces = spaces
        self.default_policy_class = default_policy_class
        self.policy_dict, self.is_policy_to_train = self.config.get_multi_agent_setup(
            env=self.env,
            spaces=self.spaces,
            default_policy_class=self.default_policy_class,
        )

        self.policy_map: Optional[PolicyMap] = None
        # TODO(jungong) : clean up after non-connector env_runner is fully deprecated.
        self.preprocessors: Dict[PolicyID, Preprocessor] = None

        # Check available number of GPUs.
        num_gpus = (
            self.config.num_gpus
            if self.worker_index == 0
            else self.config.num_gpus_per_env_runner
        )

        # Error if we don't find enough GPUs.
        if (
            ray.is_initialized()
            and ray._private.worker._mode() != ray._private.worker.LOCAL_MODE
            and not config._fake_gpus
        ):
            devices = []
            if self.config.framework_str in ["tf2", "tf"]:
                devices = get_tf_gpu_devices()
            elif self.config.framework_str == "torch":
                devices = list(range(torch.cuda.device_count()))

            if len(devices) < num_gpus:
                raise RuntimeError(
                    ERR_MSG_NO_GPUS.format(len(devices), devices) + HOWTO_CHANGE_CONFIG
                )
        # Warn, if running in local-mode and actual GPUs (not faked) are
        # requested.
        elif (
            ray.is_initialized()
            and ray._private.worker._mode() == ray._private.worker.LOCAL_MODE
            and num_gpus > 0
            and not self.config._fake_gpus
        ):
            logger.warning(
                "You are running ray with `local_mode=True`, but have "
                f"configured {num_gpus} GPUs to be used! In local mode, "
                f"Policies are placed on the CPU and the `num_gpus` setting "
                f"is ignored."
            )

        # Per-policy observation filters; default is a pass-through NoFilter.
        self.filters: Dict[PolicyID, Filter] = defaultdict(NoFilter)

        # If RLModule API is enabled, multi_rl_module_spec holds the specs of the
        # RLModules.
        self.multi_rl_module_spec = None
        self._update_policy_map(policy_dict=self.policy_dict)

        # Update Policy's view requirements from Model, only if Policy directly
        # inherited from base `Policy` class. At this point here, the Policy
        # must have its Model (if any) defined and ready to output an initial
        # state.
        for pol in self.policy_map.values():
            if not pol._model_init_state_automatically_added:
                pol._update_model_view_requirements_from_init_state()

        if (
            self.config.is_multi_agent
            and self.env is not None
            and not isinstance(
                self.env,
                (BaseEnv, ExternalMultiAgentEnv, MultiAgentEnv, ray.actor.ActorHandle),
            )
        ):
            raise ValueError(
                f"You are running a multi-agent setup, but the env {self.env} is not a "
                f"subclass of BaseEnv, MultiAgentEnv, ActorHandle, or "
                f"ExternalMultiAgentEnv!"
            )

        if self.worker_index == 0:
            logger.info("Built filter map: {}".format(self.filters))

        # This RolloutWorker has no env.
        if self.env is None:
            self.async_env = None
        # Use a custom env-vectorizer and call it providing self.env.
        elif "custom_vector_env" in self.config:
            self.async_env = self.config.custom_vector_env(self.env)
        # Default: Vectorize self.env via the make_sub_env function. This adds
        # further clones of self.env and creates a RLlib BaseEnv (which is
        # vectorized under the hood).
        else:
            # Always use vector env for consistency even if num_envs_per_env_runner=1.
            self.async_env: BaseEnv = convert_to_base_env(
                self.env,
                make_env=self.make_sub_env_fn,
                num_envs=self.config.num_envs_per_env_runner,
                remote_envs=self.config.remote_worker_envs,
                remote_env_batch_wait_ms=self.config.remote_env_batch_wait_ms,
                worker=self,
                restart_failed_sub_environments=(
                    self.config.restart_failed_sub_environments
                ),
            )

        # `truncate_episodes`: Allow a batch to contain more than one episode
        # (fragments) and always make the batch `rollout_fragment_length`
        # long.
        rollout_fragment_length_for_sampler = configured_rollout_fragment_length
        if self.config.batch_mode == "truncate_episodes":
            pack = True
        # `complete_episodes`: Never cut episodes and sampler will return
        # exactly one (complete) episode per poll.
        else:
            assert self.config.batch_mode == "complete_episodes"
            rollout_fragment_length_for_sampler = float("inf")
            pack = False

        # Create the IOContext for this worker.
        self.io_context: IOContext = IOContext(
            log_dir, self.config, self.worker_index, self
        )

        # Only render on the local worker (or worker #1 when remote workers
        # exist), and only if explicitly enabled.
        render = False
        if self.config.render_env is True and (
            self.num_workers == 0 or self.worker_index == 1
        ):
            render = True

        if self.env is None:
            self.sampler = None
        else:
            self.sampler = SyncSampler(
                worker=self,
                env=self.async_env,
                clip_rewards=clip_rewards,
                rollout_fragment_length=rollout_fragment_length_for_sampler,
                count_steps_by=self.config.count_steps_by,
                callbacks=self.callbacks,
                multiple_episodes_in_batch=pack,
                normalize_actions=self.config.normalize_actions,
                clip_actions=self.config.clip_actions,
                observation_fn=self.config.observation_fn,
                sample_collector_class=self.config.sample_collector,
                render=render,
            )

        # Input/output readers/writers (sampler, offline files, datasets, ...)
        # as configured; both are built from the IOContext above.
        self.input_reader: InputReader = self._get_input_creator_from_config()(
            self.io_context
        )
        self.output_writer: OutputWriter = self._get_output_creator_from_config()(
            self.io_context
        )

        # The current weights sequence number (version). May remain None for when
        # not tracking weights versions.
        self.weights_seq_no: Optional[int] = None
619
+
620
+ @override(EnvRunner)
621
+ def make_env(self):
622
+ # Override this method, b/c it's abstract and must be overridden.
623
+ # However, we see no point in implementing it for the old API stack any longer
624
+ # (the RolloutWorker class will be deprecated soon).
625
+ raise NotImplementedError
626
+
627
+ @override(EnvRunner)
628
+ def assert_healthy(self):
629
+ is_healthy = self.policy_map and self.input_reader and self.output_writer
630
+ assert is_healthy, (
631
+ f"RolloutWorker {self} (idx={self.worker_index}; "
632
+ f"num_workers={self.num_workers}) not healthy!"
633
+ )
634
+
635
+ @override(EnvRunner)
636
+ def sample(self, **kwargs) -> SampleBatchType:
637
+ """Returns a batch of experience sampled from this worker.
638
+
639
+ This method must be implemented by subclasses.
640
+
641
+ Returns:
642
+ A columnar batch of experiences (e.g., tensors) or a MultiAgentBatch.
643
+
644
+ .. testcode::
645
+ :skipif: True
646
+
647
+ import gymnasium as gym
648
+ from ray.rllib.evaluation.rollout_worker import RolloutWorker
649
+ from ray.rllib.algorithms.ppo.ppo_tf_policy import PPOTF1Policy
650
+ worker = RolloutWorker(
651
+ env_creator=lambda _: gym.make("CartPole-v1"),
652
+ default_policy_class=PPOTF1Policy,
653
+ config=AlgorithmConfig(),
654
+ )
655
+ print(worker.sample())
656
+
657
+ .. testoutput::
658
+
659
+ SampleBatch({"obs": [...], "action": [...], ...})
660
+ """
661
+ if self.config.fake_sampler and self.last_batch is not None:
662
+ return self.last_batch
663
+ elif self.input_reader is None:
664
+ raise ValueError(
665
+ "RolloutWorker has no `input_reader` object! "
666
+ "Cannot call `sample()`. You can try setting "
667
+ "`create_env_on_driver` to True."
668
+ )
669
+
670
+ if log_once("sample_start"):
671
+ logger.info(
672
+ "Generating sample batch of size {}".format(
673
+ self.total_rollout_fragment_length
674
+ )
675
+ )
676
+
677
+ batches = [self.input_reader.next()]
678
+ steps_so_far = (
679
+ batches[0].count
680
+ if self.config.count_steps_by == "env_steps"
681
+ else batches[0].agent_steps()
682
+ )
683
+
684
+ # In truncate_episodes mode, never pull more than 1 batch per env.
685
+ # This avoids over-running the target batch size.
686
+ if (
687
+ self.config.batch_mode == "truncate_episodes"
688
+ and not self.config.offline_sampling
689
+ ):
690
+ max_batches = self.config.num_envs_per_env_runner
691
+ else:
692
+ max_batches = float("inf")
693
+ while steps_so_far < self.total_rollout_fragment_length and (
694
+ len(batches) < max_batches
695
+ ):
696
+ batch = self.input_reader.next()
697
+ steps_so_far += (
698
+ batch.count
699
+ if self.config.count_steps_by == "env_steps"
700
+ else batch.agent_steps()
701
+ )
702
+ batches.append(batch)
703
+
704
+ batch = concat_samples(batches)
705
+
706
+ self.callbacks.on_sample_end(worker=self, samples=batch)
707
+
708
+ # Always do writes prior to compression for consistency and to allow
709
+ # for better compression inside the writer.
710
+ self.output_writer.write(batch)
711
+
712
+ if log_once("sample_end"):
713
+ logger.info("Completed sample batch:\n\n{}\n".format(summarize(batch)))
714
+
715
+ if self.config.compress_observations:
716
+ batch.compress(bulk=self.config.compress_observations == "bulk")
717
+
718
+ if self.config.fake_sampler:
719
+ self.last_batch = batch
720
+
721
+ return batch
722
+
723
+ @override(EnvRunner)
724
+ def get_spaces(self) -> Dict[str, Tuple[Space, Space]]:
725
+ spaces = self.foreach_policy(
726
+ lambda p, pid: (pid, p.observation_space, p.action_space)
727
+ )
728
+ spaces = {e[0]: (getattr(e[1], "original_space", e[1]), e[2]) for e in spaces}
729
+ # Try to add the actual env's obs/action spaces.
730
+ env_spaces = self.foreach_env(
731
+ lambda env: (env.observation_space, env.action_space)
732
+ )
733
+ if env_spaces:
734
+ from ray.rllib.env import INPUT_ENV_SPACES
735
+
736
+ spaces[INPUT_ENV_SPACES] = env_spaces[0]
737
+ return spaces
738
+
739
+ @ray.method(num_returns=2)
740
+ def sample_with_count(self) -> Tuple[SampleBatchType, int]:
741
+ """Same as sample() but returns the count as a separate value.
742
+
743
+ Returns:
744
+ A columnar batch of experiences (e.g., tensors) and the
745
+ size of the collected batch.
746
+
747
+ .. testcode::
748
+ :skipif: True
749
+
750
+ import gymnasium as gym
751
+ from ray.rllib.evaluation.rollout_worker import RolloutWorker
752
+ from ray.rllib.algorithms.ppo.ppo_tf_policy import PPOTF1Policy
753
+ worker = RolloutWorker(
754
+ env_creator=lambda _: gym.make("CartPole-v1"),
755
+ default_policy_class=PPOTFPolicy)
756
+ print(worker.sample_with_count())
757
+
758
+ .. testoutput::
759
+
760
+ (SampleBatch({"obs": [...], "action": [...], ...}), 3)
761
+ """
762
+ batch = self.sample()
763
+ return batch, batch.count
764
+
765
+ def learn_on_batch(self, samples: SampleBatchType) -> Dict:
766
+ """Update policies based on the given batch.
767
+
768
+ This is the equivalent to apply_gradients(compute_gradients(samples)),
769
+ but can be optimized to avoid pulling gradients into CPU memory.
770
+
771
+ Args:
772
+ samples: The SampleBatch or MultiAgentBatch to learn on.
773
+
774
+ Returns:
775
+ Dictionary of extra metadata from compute_gradients().
776
+
777
+ .. testcode::
778
+ :skipif: True
779
+
780
+ import gymnasium as gym
781
+ from ray.rllib.evaluation.rollout_worker import RolloutWorker
782
+ from ray.rllib.algorithms.ppo.ppo_tf_policy import PPOTF1Policy
783
+ worker = RolloutWorker(
784
+ env_creator=lambda _: gym.make("CartPole-v1"),
785
+ default_policy_class=PPOTF1Policy)
786
+ batch = worker.sample()
787
+ info = worker.learn_on_batch(samples)
788
+ """
789
+ if log_once("learn_on_batch"):
790
+ logger.info(
791
+ "Training on concatenated sample batches:\n\n{}\n".format(
792
+ summarize(samples)
793
+ )
794
+ )
795
+
796
+ info_out = {}
797
+ if isinstance(samples, MultiAgentBatch):
798
+ builders = {}
799
+ to_fetch = {}
800
+ for pid, batch in samples.policy_batches.items():
801
+ if self.is_policy_to_train is not None and not self.is_policy_to_train(
802
+ pid, samples
803
+ ):
804
+ continue
805
+ # Decompress SampleBatch, in case some columns are compressed.
806
+ batch.decompress_if_needed()
807
+
808
+ policy = self.policy_map[pid]
809
+ tf_session = policy.get_session()
810
+ if tf_session and hasattr(policy, "_build_learn_on_batch"):
811
+ builders[pid] = _TFRunBuilder(tf_session, "learn_on_batch")
812
+ to_fetch[pid] = policy._build_learn_on_batch(builders[pid], batch)
813
+ else:
814
+ info_out[pid] = policy.learn_on_batch(batch)
815
+
816
+ info_out.update({pid: builders[pid].get(v) for pid, v in to_fetch.items()})
817
+ else:
818
+ if self.is_policy_to_train is None or self.is_policy_to_train(
819
+ DEFAULT_POLICY_ID, samples
820
+ ):
821
+ info_out.update(
822
+ {
823
+ DEFAULT_POLICY_ID: self.policy_map[
824
+ DEFAULT_POLICY_ID
825
+ ].learn_on_batch(samples)
826
+ }
827
+ )
828
+ if log_once("learn_out"):
829
+ logger.debug("Training out:\n\n{}\n".format(summarize(info_out)))
830
+ return info_out
831
+
832
+ def sample_and_learn(
833
+ self,
834
+ expected_batch_size: int,
835
+ num_sgd_iter: int,
836
+ sgd_minibatch_size: str,
837
+ standardize_fields: List[str],
838
+ ) -> Tuple[dict, int]:
839
+ """Sample and batch and learn on it.
840
+
841
+ This is typically used in combination with distributed allreduce.
842
+
843
+ Args:
844
+ expected_batch_size: Expected number of samples to learn on.
845
+ num_sgd_iter: Number of SGD iterations.
846
+ sgd_minibatch_size: SGD minibatch size.
847
+ standardize_fields: List of sample fields to normalize.
848
+
849
+ Returns:
850
+ A tuple consisting of a dictionary of extra metadata returned from
851
+ the policies' `learn_on_batch()` and the number of samples
852
+ learned on.
853
+ """
854
+ batch = self.sample()
855
+ assert batch.count == expected_batch_size, (
856
+ "Batch size possibly out of sync between workers, expected:",
857
+ expected_batch_size,
858
+ "got:",
859
+ batch.count,
860
+ )
861
+ logger.info(
862
+ "Executing distributed minibatch SGD "
863
+ "with epoch size {}, minibatch size {}".format(
864
+ batch.count, sgd_minibatch_size
865
+ )
866
+ )
867
+ info = do_minibatch_sgd(
868
+ batch,
869
+ self.policy_map,
870
+ self,
871
+ num_sgd_iter,
872
+ sgd_minibatch_size,
873
+ standardize_fields,
874
+ )
875
+ return info, batch.count
876
+
877
+ def compute_gradients(
878
+ self,
879
+ samples: SampleBatchType,
880
+ single_agent: bool = None,
881
+ ) -> Tuple[ModelGradients, dict]:
882
+ """Returns a gradient computed w.r.t the specified samples.
883
+
884
+ Uses the Policy's/ies' compute_gradients method(s) to perform the
885
+ calculations. Skips policies that are not trainable as per
886
+ `self.is_policy_to_train()`.
887
+
888
+ Args:
889
+ samples: The SampleBatch or MultiAgentBatch to compute gradients
890
+ for using this worker's trainable policies.
891
+
892
+ Returns:
893
+ In the single-agent case, a tuple consisting of ModelGradients and
894
+ info dict of the worker's policy.
895
+ In the multi-agent case, a tuple consisting of a dict mapping
896
+ PolicyID to ModelGradients and a dict mapping PolicyID to extra
897
+ metadata info.
898
+ Note that the first return value (grads) can be applied as is to a
899
+ compatible worker using the worker's `apply_gradients()` method.
900
+
901
+ .. testcode::
902
+ :skipif: True
903
+
904
+ import gymnasium as gym
905
+ from ray.rllib.evaluation.rollout_worker import RolloutWorker
906
+ from ray.rllib.algorithms.ppo.ppo_tf_policy import PPOTF1Policy
907
+ worker = RolloutWorker(
908
+ env_creator=lambda _: gym.make("CartPole-v1"),
909
+ default_policy_class=PPOTF1Policy)
910
+ batch = worker.sample()
911
+ grads, info = worker.compute_gradients(samples)
912
+ """
913
+ if log_once("compute_gradients"):
914
+ logger.info("Compute gradients on:\n\n{}\n".format(summarize(samples)))
915
+
916
+ if single_agent is True:
917
+ samples = convert_ma_batch_to_sample_batch(samples)
918
+ grad_out, info_out = self.policy_map[DEFAULT_POLICY_ID].compute_gradients(
919
+ samples
920
+ )
921
+ info_out["batch_count"] = samples.count
922
+ return grad_out, info_out
923
+
924
+ # Treat everything as is multi-agent.
925
+ samples = samples.as_multi_agent()
926
+
927
+ # Calculate gradients for all policies.
928
+ grad_out, info_out = {}, {}
929
+ if self.config.framework_str == "tf":
930
+ for pid, batch in samples.policy_batches.items():
931
+ if self.is_policy_to_train is not None and not self.is_policy_to_train(
932
+ pid, samples
933
+ ):
934
+ continue
935
+ policy = self.policy_map[pid]
936
+ builder = _TFRunBuilder(policy.get_session(), "compute_gradients")
937
+ grad_out[pid], info_out[pid] = policy._build_compute_gradients(
938
+ builder, batch
939
+ )
940
+ grad_out = {k: builder.get(v) for k, v in grad_out.items()}
941
+ info_out = {k: builder.get(v) for k, v in info_out.items()}
942
+ else:
943
+ for pid, batch in samples.policy_batches.items():
944
+ if self.is_policy_to_train is not None and not self.is_policy_to_train(
945
+ pid, samples
946
+ ):
947
+ continue
948
+ grad_out[pid], info_out[pid] = self.policy_map[pid].compute_gradients(
949
+ batch
950
+ )
951
+
952
+ info_out["batch_count"] = samples.count
953
+ if log_once("grad_out"):
954
+ logger.info("Compute grad info:\n\n{}\n".format(summarize(info_out)))
955
+
956
+ return grad_out, info_out
957
+
958
def apply_gradients(
    self,
    grads: Union[ModelGradients, Dict[PolicyID, ModelGradients]],
) -> None:
    """Applies the given gradients to this worker's models.

    Delegates to the individual Policies' `apply_gradients()` methods,
    skipping policies for which `self.is_policy_to_train` returns False.

    Args:
        grads: Either a single ModelGradients struct (single-agent case)
            or a dict mapping PolicyIDs to their respective gradients
            (multi-agent case).
    """
    if log_once("apply_gradients"):
        logger.info("Apply gradients:\n\n{}\n".format(summarize(grads)))

    def _trainable(pid):
        # No filter configured -> everything is trainable.
        return self.is_policy_to_train is None or self.is_policy_to_train(pid, None)

    # Multi-agent case: `grads` maps PolicyIDs to ModelGradients.
    if isinstance(grads, dict):
        for pid, policy_grads in grads.items():
            if _trainable(pid):
                self.policy_map[pid].apply_gradients(policy_grads)
    # Single-agent case: `grads` is a plain ModelGradients struct.
    elif _trainable(DEFAULT_POLICY_ID):
        self.policy_map[DEFAULT_POLICY_ID].apply_gradients(grads)
1000
+
1001
@override(EnvRunner)
def get_metrics(self) -> List[RolloutMetrics]:
    """Returns the thus-far collected metrics from this worker's rollouts.

    Returns:
        List of RolloutMetrics collected thus-far (empty if this worker
        has no sampler).
    """
    # Without a sampler there is nothing to report.
    if self.sampler is None:
        return []
    return self.sampler.get_metrics()
1015
+
1016
def foreach_env(self, func: Callable[[EnvType], T]) -> List[T]:
    """Calls the given function with each sub-environment as arg.

    Args:
        func: The function to call for each underlying
            sub-environment (as only arg).

    Returns:
        The list of return values of all calls to `func([env])`.
    """
    if self.async_env is None:
        return []

    sub_envs = self.async_env.get_sub_environments()
    if sub_envs:
        # Vectorized case: apply `func` to every sub-environment.
        return [func(sub_env) for sub_env in sub_envs]
    # `get_sub_environments()` not implemented (empty list) -> apply
    # `func` directly to the BaseEnv.
    return [func(self.async_env)]
1038
+
1039
def foreach_env_with_context(
    self, func: Callable[[EnvType, EnvContext], T]
) -> List[T]:
    """Calls given function with each sub-env plus env_ctx as args.

    Args:
        func: The function to call for each underlying
            sub-environment and its EnvContext (as the args).

    Returns:
        The list of return values of all calls to `func([env, ctx])`.
    """
    if self.async_env is None:
        return []

    sub_envs = self.async_env.get_sub_environments()
    if not sub_envs:
        # `get_sub_environments()` not implemented (empty list) -> apply
        # `func` to the BaseEnv itself, with this worker's own context.
        return [func(self.async_env, self.env_context)]

    # Vectorized case: each sub-env gets a context carrying its own
    # vector index.
    results = []
    for idx, sub_env in enumerate(sub_envs):
        sub_ctx = self.env_context.copy_with_overrides(vector_index=idx)
        results.append(func(sub_env, sub_ctx))
    return results
1067
+
1068
def get_policy(self, policy_id: PolicyID = DEFAULT_POLICY_ID) -> Optional[Policy]:
    """Return policy for the specified id, or None.

    Args:
        policy_id: ID of the policy to return. None for DEFAULT_POLICY_ID
            (in the single agent case).

    Returns:
        The policy under the given ID (or None if not found).
    """
    # `dict.get` yields None for unknown IDs, matching the contract above.
    return self.policy_map.get(policy_id)
1079
+
1080
def add_policy(
    self,
    policy_id: PolicyID,
    policy_cls: Optional[Type[Policy]] = None,
    policy: Optional[Policy] = None,
    *,
    observation_space: Optional[Space] = None,
    action_space: Optional[Space] = None,
    config: Optional[PartialAlgorithmConfigDict] = None,
    policy_state: Optional[PolicyState] = None,
    policy_mapping_fn=None,
    policies_to_train: Optional[
        Union[Collection[PolicyID], Callable[[PolicyID, SampleBatchType], bool]]
    ] = None,
    module_spec: Optional[RLModuleSpec] = None,
) -> Policy:
    """Adds a new policy to this RolloutWorker.

    Args:
        policy_id: ID of the policy to add.
        policy_cls: The Policy class to use for constructing the new Policy.
            Note: Only one of `policy_cls` or `policy` must be provided.
        policy: The Policy instance to add to this worker.
            Note: Only one of `policy_cls` or `policy` must be provided.
        observation_space: The observation space of the policy to add.
        action_space: The action space of the policy to add.
        config: The config overrides for the policy to add.
        policy_state: Optional state dict to apply to the new policy
            instance, right after its construction.
        policy_mapping_fn: An optional (updated) policy mapping function to
            use from here on. Already ongoing episodes keep their old
            mapping until they end.
        policies_to_train: An optional collection of policy IDs to be
            trained, or a callable taking PolicyID (and optionally a
            SampleBatchType) and returning a bool. If None, keeps the
            existing setup in place.
        module_spec: RLModuleSpec for the new module (new RLModule API
            only; must be None here).

    Returns:
        The newly added policy.

    Raises:
        ValueError: If both or neither of `policy_cls` / `policy` are
            provided, or if `module_spec` is given.
        KeyError: If `policy_id` already exists in this worker's PolicyMap.
    """
    # Warn (don't raise) on unconventional policy IDs.
    validate_module_id(policy_id, error=False)

    # `module_spec` only makes sense with the RLModule API enabled.
    if module_spec is not None:
        raise ValueError(
            "If you pass in module_spec to the policy, the RLModule API needs "
            "to be enabled."
        )

    if policy_id in self.policy_map:
        raise KeyError(
            f"Policy ID '{policy_id}' already exists in policy map! "
            "Make sure you use a Policy ID that has not been taken yet."
            " Policy IDs that are already in your policy map: "
            f"{list(self.policy_map.keys())}"
        )

    # Exactly one of `policy_cls` / `policy` must be given.
    if (policy_cls is None) == (policy is None):
        raise ValueError(
            "Only one of `policy_cls` or `policy` must be provided to "
            "RolloutWorker.add_policy()!"
        )

    if policy is not None:
        # Derive the spec directly from the ready-made Policy object.
        new_policy_dict = {
            policy_id: PolicySpec(
                type(policy),
                policy.observation_space,
                policy.action_space,
                policy.config,
            )
        }
    else:
        # Build the spec from class + spaces + config overrides, letting the
        # algorithm config fill in any missing pieces.
        new_policy_dict, _ = self.config.get_multi_agent_setup(
            policies={
                policy_id: PolicySpec(
                    policy_cls, observation_space, action_space, config
                )
            },
            env=self.env,
            spaces=self.spaces,
            default_policy_class=self.default_policy_class,
        )

    self.policy_dict.update(new_policy_dict)
    self._update_policy_map(
        policy_dict=new_policy_dict,
        policy=policy,
        policy_states={policy_id: policy_state},
        single_agent_rl_module_spec=module_spec,
    )

    self.set_policy_mapping_fn(policy_mapping_fn)
    if policies_to_train is not None:
        self.set_is_policy_to_train(policies_to_train)

    return self.policy_map[policy_id]
1186
+
1187
def remove_policy(
    self,
    *,
    policy_id: PolicyID = DEFAULT_POLICY_ID,
    policy_mapping_fn: Optional[Callable[[AgentID], PolicyID]] = None,
    policies_to_train: Optional[
        Union[Collection[PolicyID], Callable[[PolicyID, SampleBatchType], bool]]
    ] = None,
) -> None:
    """Removes a policy from this RolloutWorker.

    Args:
        policy_id: ID of the policy to be removed. None for
            DEFAULT_POLICY_ID.
        policy_mapping_fn: An optional (updated) policy mapping function to
            use from here on. Already ongoing episodes keep their old
            mapping until they end.
        policies_to_train: An optional collection of policy IDs to be
            trained, or a callable taking PolicyID (and optionally a
            SampleBatchType) and returning a bool. If None, keeps the
            existing setup in place.

    Raises:
        ValueError: If `policy_id` is not in this worker's PolicyMap.
    """
    if policy_id not in self.policy_map:
        raise ValueError(f"Policy ID '{policy_id}' not in policy map!")
    # Drop both the policy itself and its observation preprocessor.
    del self.policy_map[policy_id]
    del self.preprocessors[policy_id]
    self.set_policy_mapping_fn(policy_mapping_fn)
    if policies_to_train is not None:
        self.set_is_policy_to_train(policies_to_train)
1219
+
1220
def set_policy_mapping_fn(
    self,
    policy_mapping_fn: Optional[Callable[[AgentID, Any], PolicyID]] = None,
) -> None:
    """Sets `self.policy_mapping_fn` to a new callable (if provided).

    Args:
        policy_mapping_fn: The new mapping function to use. If None,
            will keep the existing mapping function in place.

    Raises:
        ValueError: If the resulting mapping function is not callable.
    """
    # None -> keep whatever mapping fn is currently installed.
    if policy_mapping_fn is not None:
        self.policy_mapping_fn = policy_mapping_fn
    # Validate that whatever is now in place is actually callable.
    if not callable(self.policy_mapping_fn):
        raise ValueError("`policy_mapping_fn` must be a callable!")
1234
+
1235
def set_is_policy_to_train(
    self,
    is_policy_to_train: Union[
        Collection[PolicyID], Callable[[PolicyID, Optional[SampleBatchType]], bool]
    ],
) -> None:
    """Sets `self.is_policy_to_train()` to a new callable.

    Args:
        is_policy_to_train: A collection of policy IDs to be
            trained or a callable taking PolicyID and - optionally -
            SampleBatchType and returning a bool (trainable or not?).
            If None, will keep the existing setup in place.
            Policies, whose IDs are not in the list (or for which the
            callable returns False) will not be updated.
    """
    # If a collection is given, wrap it in a simple default callable that
    # checks membership of the PolicyID in the given IDs.
    if not callable(is_policy_to_train):
        # Fixed: missing space in the original message
        # ("`is_policy_to_train`must" -> "`is_policy_to_train` must").
        assert isinstance(is_policy_to_train, (list, set, tuple)), (
            "ERROR: `is_policy_to_train` must be a [list|set|tuple] or a "
            "callable taking PolicyID and SampleBatch and returning "
            "True|False (trainable or not?)."
        )
        # Freeze the IDs into a set for O(1) membership checks.
        trainable_ids = set(is_policy_to_train)

        def is_policy_to_train(pid, batch=None):
            return pid in trainable_ids

    self.is_policy_to_train = is_policy_to_train
1265
+
1266
@PublicAPI(stability="alpha")
def get_policies_to_train(
    self, batch: Optional[SampleBatchType] = None
) -> Set[PolicyID]:
    """Returns all policies-to-train, given an optional batch.

    Loops through all policies currently in `self.policy_map` and checks
    the return value of `self.is_policy_to_train(pid, batch)`.

    Args:
        batch: An optional SampleBatchType for the
            `self.is_policy_to_train(pid, [batch]?)` check.

    Returns:
        The set of currently trainable policy IDs, given the optional
        `batch`.
    """
    trainable_ids = set()
    for pid in self.policy_map.keys():
        # No filter configured -> everything is trainable.
        if self.is_policy_to_train is None or self.is_policy_to_train(pid, batch):
            trainable_ids.add(pid)
    return trainable_ids
1288
+
1289
def for_policy(
    self,
    func: Callable[[Policy, Optional[Any]], T],
    policy_id: Optional[PolicyID] = DEFAULT_POLICY_ID,
    **kwargs,
) -> T:
    """Calls the given function with the specified policy as first arg.

    Args:
        func: The function to call with the policy as first arg.
        policy_id: The PolicyID of the policy to call the function with.

    Keyword Args:
        kwargs: Additional kwargs to be passed to the call.

    Returns:
        The return value of the function call.
    """
    target_policy = self.policy_map[policy_id]
    return func(target_policy, **kwargs)
1309
+
1310
def foreach_policy(
    self, func: Callable[[Policy, PolicyID, Optional[Any]], T], **kwargs
) -> List[T]:
    """Calls the given function with each (policy, policy_id) tuple.

    Args:
        func: The function to call with each (policy, policy ID) tuple.

    Keyword Args:
        kwargs: Additional kwargs to be passed to the call.

    Returns:
        The list of return values of all calls to
        `func([policy, pid, **kwargs])`.
    """
    results = []
    for pid, policy in self.policy_map.items():
        results.append(func(policy, pid, **kwargs))
    return results
1326
+
1327
def foreach_policy_to_train(
    self, func: Callable[[Policy, PolicyID, Optional[Any]], T], **kwargs
) -> List[T]:
    """Calls the given function with each trainable (policy, policy_id).

    Only those policies/IDs will be called on, for which
    `self.is_policy_to_train()` returns True.

    Args:
        func: The function to call with each (policy, policy ID) tuple,
            for only those policies that `self.is_policy_to_train`
            returns True.

    Keyword Args:
        kwargs: Additional kwargs to be passed to the call.

    Returns:
        The list of return values of all calls to
        `func([policy, pid, **kwargs])`.
    """
    # Iterate over keys() only and fetch a policy just before calling
    # `func`: touching policy_map values for skipped pids could trigger
    # disk access for policies that were offloaded to disk.
    results = []
    for pid in self.policy_map.keys():
        if self.is_policy_to_train is not None and not self.is_policy_to_train(
            pid, None
        ):
            continue
        results.append(func(self.policy_map[pid], pid, **kwargs))
    return results
1359
+
1360
def sync_filters(self, new_filters: dict) -> None:
    """Changes self's filter to given and rebases any accumulated delta.

    Args:
        new_filters: Filters with new state to update local copy.
    """
    # Every locally known filter must have a counterpart in `new_filters`.
    assert all(key in new_filters for key in self.filters)
    for key, local_filter in self.filters.items():
        local_filter.sync(new_filters[key])
1369
+
1370
def get_filters(self, flush_after: bool = False) -> Dict:
    """Returns a snapshot of filters.

    Args:
        flush_after: Clears the filter buffer state.

    Returns:
        Dict for serializable filters
    """
    snapshot = {}
    for key, flt in self.filters.items():
        snapshot[key] = flt.as_serializable()
        if flush_after:
            # Drop accumulated deltas now that they have been captured.
            flt.reset_buffer()
    return snapshot
1385
+
1386
def get_state(self) -> dict:
    """Returns this worker's serializable state.

    Returns:
        A state dict with keys: "policy_ids", "policy_states",
        "policy_mapping_fn", "is_policy_to_train", and "filters".
    """
    filters = self.get_filters(flush_after=True)
    policy_states = {}
    for pid in self.policy_map.keys():
        # If required by the user, only capture policies that are actually
        # trainable. Otherwise, capture all policies (for saving to disk).
        if (
            not self.config.checkpoint_trainable_policies_only
            or self.is_policy_to_train is None
            # Fixed: pass the batch arg explicitly (as None) for consistency
            # with all other `is_policy_to_train` call sites; this also
            # supports user callables that don't declare a default for
            # their `batch` parameter.
            or self.is_policy_to_train(pid, None)
        ):
            policy_states[pid] = self.policy_map[pid].get_state()

    return {
        # List all known policy IDs here for convenience. When an Algorithm gets
        # restored from a checkpoint, it will not have access to the list of
        # possible IDs as each policy is stored in its own sub-dir
        # (see "policy_states").
        "policy_ids": list(self.policy_map.keys()),
        # Note that this field will not be stored in the algorithm checkpoint's
        # state file, but each policy will get its own state file generated in
        # a sub-dir within the algo's checkpoint dir.
        "policy_states": policy_states,
        # Also store current mapping fn and which policies to train.
        "policy_mapping_fn": self.policy_mapping_fn,
        "is_policy_to_train": self.is_policy_to_train,
        # TODO: Filters will be replaced by connectors.
        "filters": filters,
    }
1415
+
1416
def set_state(self, state: dict) -> None:
    """Restores this worker's state from a state dict (see `get_state()`).

    Args:
        state: The state dict as returned by a previous `get_state()` call,
            or a legacy pickled-bytes blob from old checkpoints.
    """
    # Backward compatibility (old checkpoints' states would have the local
    # worker state as a bytes object, not a dict).
    if isinstance(state, bytes):
        state = pickle.loads(state)

    # TODO: Once filters are handled by connectors, get rid of the "filters"
    #  key in `state` entirely (will be part of the policies then).
    self.sync_filters(state["filters"])

    # Support older checkpoint versions (< 1.0), in which the policy_map
    # was stored under the "state" key, not "policy_states".
    policy_states = (
        state["policy_states"] if "policy_states" in state else state["state"]
    )
    for pid, policy_state in policy_states.items():
        # If - for some reason - we have an invalid PolicyID in the state,
        # this might be from an older checkpoint (pre v1.0). Just warn here.
        validate_module_id(pid, error=False)

        if pid not in self.policy_map:
            spec = policy_state.get("policy_spec", None)
            if spec is None:
                # Fixed typo in log message: "multagent" -> "multiagent".
                logger.warning(
                    f"PolicyID '{pid}' was probably added on-the-fly (not"
                    " part of the static `multiagent.policies` config) and"
                    " no PolicySpec objects found in the pickled policy "
                    f"state. Will not add `{pid}`, but ignore it for now."
                )
            else:
                policy_spec = (
                    PolicySpec.deserialize(spec) if isinstance(spec, dict) else spec
                )
                self.add_policy(
                    policy_id=pid,
                    policy_cls=policy_spec.policy_class,
                    observation_space=policy_spec.observation_space,
                    action_space=policy_spec.action_space,
                    config=policy_spec.config,
                )
        if pid in self.policy_map:
            self.policy_map[pid].set_state(policy_state)

    # Also restore mapping fn and which policies to train.
    if "policy_mapping_fn" in state:
        self.set_policy_mapping_fn(state["policy_mapping_fn"])
    if state.get("is_policy_to_train") is not None:
        self.set_is_policy_to_train(state["is_policy_to_train"])
1464
+
1465
def get_weights(
    self,
    policies: Optional[Collection[PolicyID]] = None,
    inference_only: bool = False,
) -> Dict[PolicyID, ModelWeights]:
    """Returns each policies' model weights of this worker.

    Args:
        policies: List of PolicyIDs to get the weights from.
            Use None for all policies.
        inference_only: This argument is only added for interface
            consistency with the new api stack (unused here).

    Returns:
        Dict mapping PolicyIDs to ModelWeights.
    """
    if policies is None:
        policies = list(self.policy_map.keys())
    # Use a set for O(1) membership checks below (the original list would
    # be scanned once per policy in the map).
    requested = set(force_list(policies))

    return {
        # Make sure to only iterate over keys() and not items(). Iterating over
        # items will access policy_map elements even for pids that we do not need,
        # i.e. those that are not in policies. Access to policy_map elements can
        # cause disk access for policies that were offloaded to disk. Since these
        # policies will be skipped in the for-loop accessing them is unnecessary,
        # making subsequent disk access unnecessary.
        pid: self.policy_map[pid].get_weights()
        for pid in self.policy_map.keys()
        if pid in requested
    }
1509
+
1510
def set_weights(
    self,
    weights: Dict[PolicyID, ModelWeights],
    global_vars: Optional[Dict] = None,
    weights_seq_no: Optional[int] = None,
) -> None:
    """Sets each policies' model weights of this worker.

    Args:
        weights: Dict mapping PolicyIDs to the new weights to be used.
        global_vars: An optional global vars dict to set this
            worker to. If None, do not update the global_vars.
        weights_seq_no: If needed, a sequence number for the weights version
            can be passed into this method. If not None, will store this seq no
            (in self.weights_seq_no) and in future calls - if the seq no did not
            change wrt. the last call - will ignore the call to save on
            performance.
    """
    # Skip the actual weight update if a seq no is given and it matches the
    # one we already have (weights are already up-to-date).
    if weights_seq_no is None or weights_seq_no != self.weights_seq_no:
        # If per-policy weights are object refs, `ray.get()` them first.
        if weights and isinstance(next(iter(weights.values())), ObjectRef):
            resolved = ray.get(list(weights.values()))
            weights = dict(zip(weights.keys(), resolved))

        for pid, policy_weights in weights.items():
            if pid in self.policy_map:
                self.policy_map[pid].set_weights(policy_weights)
            elif log_once("set_weights_on_non_existent_policy"):
                logger.warning(
                    "`RolloutWorker.set_weights()` used with weights from "
                    f"policyID={pid}, but this policy cannot be found on this "
                    f"worker! Skipping ..."
                )

        # Remember the new seq no for future short-circuiting.
        self.weights_seq_no = weights_seq_no

    if global_vars:
        self.set_global_vars(global_vars)
1561
+
1562
def get_global_vars(self) -> dict:
    """Returns the current `self.global_vars` dict of this RolloutWorker.

    Returns:
        The current `self.global_vars` dict of this RolloutWorker,
        e.g. `{"timestep": 424242}`.
    """
    return self.global_vars
1582
+
1583
def set_global_vars(
    self,
    global_vars: dict,
    policy_ids: Optional[List[PolicyID]] = None,
) -> None:
    """Updates this worker's and all its policies' global vars.

    Updates are done using the dict's update method.

    Args:
        global_vars: The global_vars dict to update the `self.global_vars` dict
            from.
        policy_ids: Optional list of Policy IDs to update. If None, will update
            all policies on the to-be-updated workers.
    """
    # Split the per-policy gradient-update counters off from the rest.
    remaining_vars = global_vars.copy()
    per_policy_grad_updates = remaining_vars.pop("num_grad_updates_per_policy", {})
    self.global_vars["num_grad_updates_per_policy"].update(per_policy_grad_updates)

    # Only update explicitly provided policies or those that are being
    # trained, in order to avoid superfluous access of policies, which might
    # have been offloaded to the object store.
    # Important b/c global vars are constantly being updated.
    target_pids = policy_ids if policy_ids is not None else self.policy_map.keys()
    for pid in target_pids:
        if self.is_policy_to_train is None or self.is_policy_to_train(pid, None):
            self.policy_map[pid].on_global_var_update(
                dict(
                    remaining_vars,
                    # If count is None, Policy won't update the counter.
                    num_grad_updates=per_policy_grad_updates.get(pid),
                )
            )

    # Update all other global vars.
    self.global_vars.update(remaining_vars)
1629
+
1630
@override(EnvRunner)
def stop(self) -> None:
    """Releases all resources used by this RolloutWorker."""
    # If we have an env -> Release its resources.
    if self.env is not None:
        self.async_env.stop()

    # Close the sessions of all currently cached (in-memory) policies;
    # only static-graph tf policies actually carry a session.
    for cached_policy in self.policy_map.cache.values():
        session = cached_policy.get_session()
        if session is not None:
            session.close()
1644
+
1645
def lock(self) -> None:
    """Locks this RolloutWorker via its own threading.Lock."""
    # Blocks until the lock is acquired.
    self._lock.acquire()
1648
+
1649
def unlock(self) -> None:
    """Unlocks this RolloutWorker via its own threading.Lock."""
    # Releases the lock previously acquired via `self.lock()`.
    self._lock.release()
1652
+
1653
def setup_torch_data_parallel(
    self, url: str, world_rank: int, world_size: int, backend: str
) -> None:
    """Join a torch process group for distributed SGD.

    Args:
        url: The init-method URL used to rendezvous the process group.
        world_rank: This worker's rank within the process group.
        world_size: Total number of participants in the process group.
        backend: The torch.distributed backend to use.
    """
    logger.info(
        "Joining process group, url={}, world_rank={}, "
        "world_size={}, backend={}".format(url, world_rank, world_size, backend)
    )
    torch.distributed.init_process_group(
        backend=backend, init_method=url, rank=world_rank, world_size=world_size
    )

    # All policies must be torch-based to take part in distributed SGD.
    for pid, policy in self.policy_map.items():
        if not isinstance(policy, (TorchPolicy, TorchPolicyV2)):
            raise ValueError(
                "This policy does not support torch distributed", policy
            )
        policy.distributed_world_size = world_size
1672
+
1673
def creation_args(self) -> dict:
    """Returns the kwargs dict used to create this worker."""
    # Captured verbatim at construction time.
    return self._original_kwargs
1676
+
1677
def get_host(self) -> str:
    """Returns the hostname of the process running this evaluator."""
    # Delegates to the stdlib; returns an empty string if undeterminable.
    return platform.node()
1680
+
1681
def get_node_ip(self) -> str:
    """Returns the IP address of the node that this worker runs on."""
    # Resolved via Ray's own node-address utility.
    return ray.util.get_node_ip_address()
1684
+
1685
def find_free_port(self) -> int:
    """Finds a free port on the node that this worker runs on."""
    # Local import keeps the dependency off the module-import path.
    from ray.air._internal.util import find_free_port

    return find_free_port()
1690
+
1691
def _update_policy_map(
    self,
    *,
    policy_dict: MultiAgentPolicyConfigDict,
    policy: Optional[Policy] = None,
    policy_states: Optional[Dict[PolicyID, PolicyState]] = None,
    single_agent_rl_module_spec: Optional[RLModuleSpec] = None,
) -> None:
    """Updates the policy map (and other stuff) on this worker.

    Steps performed:
    1. Update the observation preprocessors and the policy_specs with the
       postprocessed observation spaces and merged algorithm configs.
    2. Build/extend `self.policy_map` with the new policies.
    3. Update the filter dict.
    4. Call the `on_create_policy()` callback hook for newly created
       policies.

    Args:
        policy_dict: The policy dict to update the policy map with.
        policy: The policy to update the policy map with.
        policy_states: The policy states to update the policy map with.
        single_agent_rl_module_spec: The RLModuleSpec to add to the
            MultiRLModuleSpec. If None, the config's
            `get_default_rl_module_spec` method's output will be used to
            create the policy with.
    """
    # Merge configs / postprocess observation spaces. As a side effect,
    # this also fills in the preprocessors dict.
    complete_policy_dict = self._get_complete_policy_specs_dict(policy_dict)

    # Build (or extend) the self.policy_map dict.
    self._build_policy_map(
        policy_dict=complete_policy_dict,
        policy=policy,
        policy_states=policy_states,
    )

    # Initialize the filter dict.
    self._update_filter_dict(complete_policy_dict)

    # Call callback policy init hooks (only if the added policy did not exist
    # before).
    if policy is None:
        self._call_callbacks_on_create_policy()

    # Only the local worker (index 0) logs the resulting maps.
    if self.worker_index == 0:
        logger.info(f"Built policy map: {self.policy_map}")
        logger.info(f"Built preprocessor map: {self.preprocessors}")
1744
+
1745
def _get_complete_policy_specs_dict(
    self, policy_dict: MultiAgentPolicyConfigDict
) -> MultiAgentPolicyConfigDict:
    """Processes the policy dict and creates a new copy with the processed attrs.

    This processes the observation_space and prepares them for passing to rl
    module construction. It also merges the policy configs with the algorithm
    config. During this processing, we will also construct the preprocessors
    dict.
    """
    from ray.rllib.algorithms.algorithm_config import AlgorithmConfig

    processed_dict = copy.deepcopy(policy_dict)
    # Lazily create the preprocessors dict, if it does not exist yet.
    self.preprocessors = self.preprocessors or {}

    # Process specs in deterministic (sorted-by-name) order.
    for policy_name, spec in sorted(processed_dict.items()):
        logger.debug("Creating policy for {}".format(policy_name))

        if isinstance(spec.config, AlgorithmConfig):
            # Policy brings its own complete AlgorithmConfig -> use as-is.
            merged_config = spec.config
        else:
            # Merge the policy's (partial) overrides into a mutable copy of
            # this worker's algorithm config.
            merged_config: "AlgorithmConfig" = self.config.copy(copy_frozen=False)
            merged_config.update_from_dict(spec.config or {})

        # Stamp this worker's index into the per-policy config.
        merged_config.worker_index = self.worker_index

        # Preprocessors: default to None for this policy.
        obs_space = spec.observation_space
        self.preprocessors[policy_name] = None
        if self.preprocessing_enabled:
            # Policies should deal with preprocessed (automatically flattened)
            # observations if preprocessing is enabled.
            preprocessor = ModelCatalog.get_preprocessor_for_space(
                obs_space,
                merged_config.model,
                include_multi_binary=False,
            )
            # Original observation space should be accessible at
            # obs_space.original_space after this step.
            if preprocessor is not None:
                obs_space = preprocessor.observation_space

        spec.config = merged_config
        spec.observation_space = obs_space

    return processed_dict
1796
+
1797
def _update_policy_dict_with_multi_rl_module(
    self, policy_dict: MultiAgentPolicyConfigDict
) -> MultiAgentPolicyConfigDict:
    """Injects this worker's MultiRLModuleSpec into every policy spec's config.

    Mutates the specs in `policy_dict` in place and returns the same dict.
    """
    module_spec = self.multi_rl_module_spec
    for policy_spec in policy_dict.values():
        policy_spec.config["__multi_rl_module_spec"] = module_spec
    return policy_dict
1803
+
1804
def _build_policy_map(
    self,
    *,
    policy_dict: MultiAgentPolicyConfigDict,
    policy: Optional[Policy] = None,
    policy_states: Optional[Dict[PolicyID, PolicyState]] = None,
) -> None:
    """Adds the given policy_dict to `self.policy_map`.

    Args:
        policy_dict: The MultiAgentPolicyConfigDict to be added to this
            worker's PolicyMap.
        policy: If the policy to add already exists, user can provide it here.
            Note that if given, this same object is used for EVERY entry in
            `policy_dict`; otherwise a fresh policy is constructed per entry.
        policy_states: Optional dict from PolicyIDs to PolicyStates to
            restore the states of the policies being built.
    """

    # If our policy_map does not exist yet, create it here.
    self.policy_map = self.policy_map or PolicyMap(
        capacity=self.config.policy_map_capacity,
        policy_states_are_swappable=self.config.policy_states_are_swappable,
    )

    # Loop through given policy-dict and add each entry to our map.
    # Sorted iteration keeps creation order deterministic across workers.
    for name, policy_spec in sorted(policy_dict.items()):
        # Create the actual policy object.
        if policy is None:
            new_policy = create_policy_for_framework(
                policy_id=name,
                policy_class=get_tf_eager_cls_if_necessary(
                    policy_spec.policy_class, policy_spec.config
                ),
                merged_config=policy_spec.config,
                observation_space=policy_spec.observation_space,
                action_space=policy_spec.action_space,
                worker_index=self.worker_index,
                seed=self.seed,
            )
        else:
            new_policy = policy

        self.policy_map[name] = new_policy

        restore_states = (policy_states or {}).get(name, None)
        # Set the state of the newly created policy before syncing filters, etc.
        if restore_states:
            new_policy.set_state(restore_states)
1851
+
1852
def _update_filter_dict(self, policy_dict: MultiAgentPolicyConfigDict) -> None:
    """Updates the filter dict for the given policy_dict.

    For each policy in `policy_dict`, lazily creates its agent/action
    connectors (only if both are missing) and registers its filters for
    syncing.
    """

    for name, policy_spec in sorted(policy_dict.items()):
        new_policy = self.policy_map[name]
        # Note(jungong) : We should only create new connectors for the
        # policy iff we are creating a new policy from scratch. i.e,
        # we should NOT create new connectors when we already have the
        # policy object created before this function call or have the
        # restoring states from the caller.
        # Also note that we cannot just check the existence of connectors
        # to decide whether we should create connectors because we may be
        # restoring a policy that has 0 connectors configured.
        if (
            new_policy.agent_connectors is None
            or new_policy.action_connectors is None
        ):
            # TODO(jungong) : revisit this. It will be nicer to create
            # connectors as the last step of Policy.__init__().
            create_connectors_for_policy(new_policy, policy_spec.config)
        maybe_get_filters_for_syncing(self, name)
1873
+
1874
+ def _call_callbacks_on_create_policy(self):
1875
+ """Calls the on_create_policy callback for each policy in the policy map."""
1876
+ for name, policy in self.policy_map.items():
1877
+ self.callbacks.on_create_policy(policy_id=name, policy=policy)
1878
+
1879
+ def _get_input_creator_from_config(self):
1880
+ def valid_module(class_path):
1881
+ if (
1882
+ isinstance(class_path, str)
1883
+ and not os.path.isfile(class_path)
1884
+ and "." in class_path
1885
+ ):
1886
+ module_path, class_name = class_path.rsplit(".", 1)
1887
+ try:
1888
+ spec = importlib.util.find_spec(module_path)
1889
+ if spec is not None:
1890
+ return True
1891
+ except (ModuleNotFoundError, ValueError):
1892
+ print(
1893
+ f"module {module_path} not found while trying to get "
1894
+ f"input {class_path}"
1895
+ )
1896
+ return False
1897
+
1898
+ # A callable returning an InputReader object to use.
1899
+ if isinstance(self.config.input_, FunctionType):
1900
+ return self.config.input_
1901
+ # Use RLlib's Sampler classes (SyncSampler).
1902
+ elif self.config.input_ == "sampler":
1903
+ return lambda ioctx: ioctx.default_sampler_input()
1904
+ # Ray Dataset input -> Use `config.input_config` to construct DatasetReader.
1905
+ elif self.config.input_ == "dataset":
1906
+ assert self._ds_shards is not None
1907
+ # Input dataset shards should have already been prepared.
1908
+ # We just need to take the proper shard here.
1909
+ return lambda ioctx: DatasetReader(
1910
+ self._ds_shards[self.worker_index], ioctx
1911
+ )
1912
+ # Dict: Mix of different input methods with different ratios.
1913
+ elif isinstance(self.config.input_, dict):
1914
+ return lambda ioctx: ShuffledInput(
1915
+ MixedInput(self.config.input_, ioctx), self.config.shuffle_buffer_size
1916
+ )
1917
+ # A pre-registered input descriptor (str).
1918
+ elif isinstance(self.config.input_, str) and registry_contains_input(
1919
+ self.config.input_
1920
+ ):
1921
+ return registry_get_input(self.config.input_)
1922
+ # D4RL input.
1923
+ elif "d4rl" in self.config.input_:
1924
+ env_name = self.config.input_.split(".")[-1]
1925
+ return lambda ioctx: D4RLReader(env_name, ioctx)
1926
+ # Valid python module (class path) -> Create using `from_config`.
1927
+ elif valid_module(self.config.input_):
1928
+ return lambda ioctx: ShuffledInput(
1929
+ from_config(self.config.input_, ioctx=ioctx)
1930
+ )
1931
+ # JSON file or list of JSON files -> Use JsonReader (shuffled).
1932
+ else:
1933
+ return lambda ioctx: ShuffledInput(
1934
+ JsonReader(self.config.input_, ioctx), self.config.shuffle_buffer_size
1935
+ )
1936
+
1937
+ def _get_output_creator_from_config(self):
1938
+ if isinstance(self.config.output, FunctionType):
1939
+ return self.config.output
1940
+ elif self.config.output is None:
1941
+ return lambda ioctx: NoopOutput()
1942
+ elif self.config.output == "dataset":
1943
+ return lambda ioctx: DatasetWriter(
1944
+ ioctx, compress_columns=self.config.output_compress_columns
1945
+ )
1946
+ elif self.config.output == "logdir":
1947
+ return lambda ioctx: JsonWriter(
1948
+ ioctx.log_dir,
1949
+ ioctx,
1950
+ max_file_size=self.config.output_max_file_size,
1951
+ compress_columns=self.config.output_compress_columns,
1952
+ )
1953
+ else:
1954
+ return lambda ioctx: JsonWriter(
1955
+ self.config.output,
1956
+ ioctx,
1957
+ max_file_size=self.config.output_max_file_size,
1958
+ compress_columns=self.config.output_compress_columns,
1959
+ )
1960
+
1961
def _get_make_sub_env_fn(
    self, env_creator, env_context, validate_env, env_wrapper, seed
):
    """Returns a function that creates one wrapped, seeded sub-environment.

    Args:
        env_creator: Callable mapping an EnvContext to a new env instance.
        env_context: Base EnvContext, copied per sub-env with its vector_index.
        validate_env: Optional user validation callable `(env, env_ctx)`.
        env_wrapper: Wrapper applied to every created sub-env.
        seed: Seed forwarded to `_update_env_seed_if_necessary`.
    """

    def _make_sub_env_local(vector_index):
        # Used to created additional environments during environment
        # vectorization.

        # Create the env context (config dict + meta-data) for
        # this particular sub-env within the vectorized one.
        env_ctx = env_context.copy_with_overrides(vector_index=vector_index)
        # Create the sub-env.
        env = env_creator(env_ctx)
        # Custom validation function given by user.
        if validate_env is not None:
            validate_env(env, env_ctx)
        # Use our wrapper, defined above.
        env = env_wrapper(env)

        # Make sure a deterministic random seed is set on
        # all the sub-environments if specified.
        _update_env_seed_if_necessary(
            env, seed, env_context.worker_index, vector_index
        )
        return env

    # NOTE(review): despite its name, `_make_sub_env_remote` is the variant
    # returned when the env context is NOT remote; it additionally fires the
    # `on_sub_environment_created` callback. For remote contexts, the plain
    # factory without the callback is returned — confirm this inversion of
    # the names vs. the condition is intentional.
    if not env_context.remote:

        def _make_sub_env_remote(vector_index):
            sub_env = _make_sub_env_local(vector_index)
            self.callbacks.on_sub_environment_created(
                worker=self,
                sub_environment=sub_env,
                env_context=env_context.copy_with_overrides(
                    worker_index=env_context.worker_index,
                    vector_index=vector_index,
                    remote=False,
                ),
            )
            return sub_env

        return _make_sub_env_remote

    else:
        return _make_sub_env_local
.venv/lib/python3.11/site-packages/ray/rllib/evaluation/sample_batch_builder.py ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import collections
2
+ import logging
3
+ import numpy as np
4
+ from typing import List, Any, Dict, TYPE_CHECKING
5
+
6
+ from ray.rllib.env.base_env import _DUMMY_AGENT_ID
7
+ from ray.rllib.policy.policy import Policy
8
+ from ray.rllib.policy.sample_batch import SampleBatch, MultiAgentBatch
9
+ from ray.rllib.utils.annotations import OldAPIStack
10
+ from ray.rllib.utils.debug import summarize
11
+ from ray.rllib.utils.deprecation import deprecation_warning
12
+ from ray.rllib.utils.typing import PolicyID, AgentID
13
+ from ray.util.debug import log_once
14
+
15
+ if TYPE_CHECKING:
16
+ from ray.rllib.callbacks.callbacks import RLlibCallback
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
+ def _to_float_array(v: List[Any]) -> np.ndarray:
22
+ arr = np.array(v)
23
+ if arr.dtype == np.float64:
24
+ return arr.astype(np.float32) # save some memory
25
+ return arr
26
+
27
+
28
@OldAPIStack
class SampleBatchBuilder:
    """Incrementally accumulates rows and emits them as a SampleBatch.

    SampleBatches hold data column-wise (as arrays); this helper lets callers
    append one row (dict) at a time and materializes the columns on demand.
    """

    # Monotonically increasing id that disambiguates unrolls within a single
    # episode.
    _next_unroll_id = 0

    def __init__(self):
        # Column name -> list of per-row values collected so far.
        self.buffers: Dict[str, List] = collections.defaultdict(list)
        # Number of rows added since the last build/reset.
        self.count = 0

    def add_values(self, **values: Any) -> None:
        """Appends a single row, given as keyword args, to this builder."""
        for key, value in values.items():
            self.buffers[key].append(value)
        self.count += 1

    def add_batch(self, batch: SampleBatch) -> None:
        """Appends all rows of an existing SampleBatch to this builder."""
        for key, column in batch.items():
            self.buffers[key].extend(column)
        self.count += batch.count

    def build_and_reset(self) -> SampleBatch:
        """Materializes all accumulated rows as a SampleBatch and clears state."""
        columns = {key: _to_float_array(vals) for key, vals in self.buffers.items()}
        batch = SampleBatch(columns)
        if SampleBatch.UNROLL_ID not in batch:
            # Stamp every row with the current unroll id, then bump the id.
            batch[SampleBatch.UNROLL_ID] = np.repeat(
                SampleBatchBuilder._next_unroll_id, batch.count
            )
            SampleBatchBuilder._next_unroll_id += 1
        self.buffers.clear()
        self.count = 0
        return batch
68
+
69
+
70
@OldAPIStack
class MultiAgentSampleBatchBuilder:
    """Util to build SampleBatches for each policy in a multi-agent env.

    Input data is per-agent, while output data is per-policy. There is an M:N
    mapping between agents and policies. We retain one local batch builder
    per agent. When an agent is done, then its local batch is appended into the
    corresponding policy batch for the agent's policy.
    """

    def __init__(
        self,
        policy_map: Dict[PolicyID, Policy],
        clip_rewards: bool,
        callbacks: "RLlibCallback",
    ):
        """Initialize a MultiAgentSampleBatchBuilder.

        Args:
            policy_map (Dict[str,Policy]): Maps policy ids to policy instances.
            clip_rewards (Union[bool,float]): Whether to clip rewards before
                postprocessing (at +/-1.0) or the actual value to +/- clip.
            callbacks: RLlib callbacks.
        """
        # This class is deprecated; warn once per process.
        if log_once("MultiAgentSampleBatchBuilder"):
            deprecation_warning(old="MultiAgentSampleBatchBuilder", error=False)
        self.policy_map = policy_map
        self.clip_rewards = clip_rewards
        # Build the Policies' SampleBatchBuilders.
        self.policy_builders = {k: SampleBatchBuilder() for k in policy_map.keys()}
        # Whenever we observe a new agent, add a new SampleBatchBuilder for
        # this agent.
        self.agent_builders = {}
        # Internal agent-to-policy map.
        self.agent_to_policy = {}
        self.callbacks = callbacks
        # Number of "inference" steps taken in the environment.
        # Regardless of the number of agents involved in each of these steps.
        self.count = 0

    def total(self) -> int:
        """Returns the total number of steps taken in the env (all agents).

        Returns:
            int: The number of steps taken in total in the environment over all
                agents.
        """

        return sum(a.count for a in self.agent_builders.values())

    def has_pending_agent_data(self) -> bool:
        """Returns whether there is pending unprocessed data.

        Returns:
            bool: True if there is at least one per-agent builder (with data
                in it).
        """

        return len(self.agent_builders) > 0

    def add_values(self, agent_id: AgentID, policy_id: PolicyID, **values: Any) -> None:
        """Add the given dictionary (row) of values to this batch.

        Args:
            agent_id: Unique id for the agent we are adding values for.
            policy_id: Unique id for policy controlling the agent.
            values: Row of values to add for this agent.
        """

        # First time we see this agent: create its builder and remember which
        # policy controls it.
        if agent_id not in self.agent_builders:
            self.agent_builders[agent_id] = SampleBatchBuilder()
            self.agent_to_policy[agent_id] = policy_id

        # Include the current agent id for multi-agent algorithms.
        if agent_id != _DUMMY_AGENT_ID:
            values["agent_id"] = agent_id

        self.agent_builders[agent_id].add_values(**values)

    def postprocess_batch_so_far(self, episode=None) -> None:
        """Apply policy postprocessors to any unprocessed rows.

        This pushes the postprocessed per-agent batches onto the per-policy
        builders, clearing per-agent state.

        Args:
            episode (Optional[Episode]): The Episode object that
                holds this MultiAgentBatchBuilder object.
        """

        # Materialize the batches so far.
        # Maps agent_id -> (controlling policy, that agent's raw batch).
        pre_batches = {}
        for agent_id, builder in self.agent_builders.items():
            pre_batches[agent_id] = (
                self.policy_map[self.agent_to_policy[agent_id]],
                builder.build_and_reset(),
            )

        # Apply postprocessor.
        post_batches = {}
        # Reward clipping happens in-place on the raw batches, BEFORE any
        # policy postprocessing.
        if self.clip_rewards is True:
            # `True` means sign-clipping to +/-1.0.
            for _, (_, pre_batch) in pre_batches.items():
                pre_batch["rewards"] = np.sign(pre_batch["rewards"])
        elif self.clip_rewards:
            # A numeric value means clipping to +/- that value.
            for _, (_, pre_batch) in pre_batches.items():
                pre_batch["rewards"] = np.clip(
                    pre_batch["rewards"],
                    a_min=-self.clip_rewards,
                    a_max=self.clip_rewards,
                )
        for agent_id, (_, pre_batch) in pre_batches.items():
            # Each agent's postprocessor also receives all OTHER agents'
            # batches (e.g. for centralized critics).
            other_batches = pre_batches.copy()
            del other_batches[agent_id]
            policy = self.policy_map[self.agent_to_policy[agent_id]]
            if (
                not pre_batch.is_single_trajectory()
                or len(set(pre_batch[SampleBatch.EPS_ID])) > 1
            ):
                raise ValueError(
                    "Batches sent to postprocessing must only contain steps "
                    "from a single trajectory.",
                    pre_batch,
                )
            # Call the Policy's Exploration's postprocess method.
            post_batches[agent_id] = pre_batch
            if getattr(policy, "exploration", None) is not None:
                policy.exploration.postprocess_trajectory(
                    policy, post_batches[agent_id], policy.get_session()
                )
            post_batches[agent_id] = policy.postprocess_trajectory(
                post_batches[agent_id], other_batches, episode
            )

        if log_once("after_post"):
            logger.info(
                "Trajectory fragment after postprocess_trajectory():\n\n{}\n".format(
                    summarize(post_batches)
                )
            )

        # Append into policy batches and reset
        # Local import avoids a circular import at module load time.
        from ray.rllib.evaluation.rollout_worker import get_global_worker

        for agent_id, post_batch in sorted(post_batches.items()):
            self.callbacks.on_postprocess_trajectory(
                worker=get_global_worker(),
                episode=episode,
                agent_id=agent_id,
                policy_id=self.agent_to_policy[agent_id],
                policies=self.policy_map,
                postprocessed_batch=post_batch,
                original_batches=pre_batches,
            )
            self.policy_builders[self.agent_to_policy[agent_id]].add_batch(post_batch)

        self.agent_builders.clear()
        self.agent_to_policy.clear()

    def check_missing_dones(self) -> None:
        """Raises if the episode ended without final data for every agent.

        NOTE(review): `builder.buffers` is the plain defaultdict of a
        SampleBatchBuilder, which does not define
        `is_terminated_or_truncated()` — this call looks like it was written
        against a collector-style buffer API; confirm the attribute actually
        exists at runtime.
        """
        for agent_id, builder in self.agent_builders.items():
            if not builder.buffers.is_terminated_or_truncated():
                raise ValueError(
                    "The environment terminated for all agents, but we still "
                    "don't have a last observation for "
                    "agent {} (policy {}). ".format(
                        agent_id, self.agent_to_policy[agent_id]
                    )
                    + "Please ensure that you include the last observations "
                    "of all live agents when setting '__all__' terminated|truncated "
                    "to True. "
                )

    def build_and_reset(self, episode=None) -> MultiAgentBatch:
        """Returns the accumulated sample batches for each policy.

        Any unprocessed rows will be first postprocessed with a policy
        postprocessor. The internal state of this builder will be reset.

        Args:
            episode (Optional[Episode]): The Episode object that
                holds this MultiAgentBatchBuilder object or None.

        Returns:
            MultiAgentBatch: Returns the accumulated sample batches for each
                policy.
        """

        self.postprocess_batch_so_far(episode)
        policy_batches = {}
        for policy_id, builder in self.policy_builders.items():
            # Skip policies that collected no data in this fragment.
            if builder.count > 0:
                policy_batches[policy_id] = builder.build_and_reset()
        old_count = self.count
        self.count = 0
        return MultiAgentBatch.wrap_as_needed(policy_batches, old_count)
.venv/lib/python3.11/site-packages/ray/rllib/evaluation/sampler.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import queue
3
+ from abc import ABCMeta, abstractmethod
4
+ from collections import defaultdict, namedtuple
5
+ from typing import (
6
+ TYPE_CHECKING,
7
+ Any,
8
+ List,
9
+ Optional,
10
+ Type,
11
+ Union,
12
+ )
13
+
14
+ from ray.rllib.env.base_env import BaseEnv, convert_to_base_env
15
+ from ray.rllib.evaluation.collectors.sample_collector import SampleCollector
16
+ from ray.rllib.evaluation.collectors.simple_list_collector import SimpleListCollector
17
+ from ray.rllib.evaluation.env_runner_v2 import EnvRunnerV2, _PerfStats
18
+ from ray.rllib.evaluation.metrics import RolloutMetrics
19
+ from ray.rllib.offline import InputReader
20
+ from ray.rllib.policy.sample_batch import concat_samples
21
+ from ray.rllib.utils.annotations import OldAPIStack, override
22
+ from ray.rllib.utils.deprecation import deprecation_warning, DEPRECATED_VALUE
23
+ from ray.rllib.utils.framework import try_import_tf
24
+ from ray.rllib.utils.typing import SampleBatchType
25
+ from ray.util.debug import log_once
26
+
27
+ if TYPE_CHECKING:
28
+ from ray.rllib.callbacks.callbacks import RLlibCallback
29
+ from ray.rllib.evaluation.observation_function import ObservationFunction
30
+ from ray.rllib.evaluation.rollout_worker import RolloutWorker
31
+
32
+ tf1, tf, _ = try_import_tf()
33
+ logger = logging.getLogger(__name__)
34
+
35
+ _PolicyEvalData = namedtuple(
36
+ "_PolicyEvalData",
37
+ ["env_id", "agent_id", "obs", "info", "rnn_state", "prev_action", "prev_reward"],
38
+ )
39
+
40
+ # A batch of RNN states with dimensions [state_index, batch, state_object].
41
+ StateBatch = List[List[Any]]
42
+
43
+
44
+ class _NewEpisodeDefaultDict(defaultdict):
45
+ def __missing__(self, env_id):
46
+ if self.default_factory is None:
47
+ raise KeyError(env_id)
48
+ else:
49
+ ret = self[env_id] = self.default_factory(env_id)
50
+ return ret
51
+
52
+
53
@OldAPIStack
class SamplerInput(InputReader, metaclass=ABCMeta):
    """Reads input experiences from an existing sampler."""

    @override(InputReader)
    def next(self) -> SampleBatchType:
        """Returns the next batch: sampled data plus any queued extra batches.

        Fix: `batches` always contains at least the `get_data()` result, so
        the previous `len(batches) == 0` RuntimeError check was unreachable;
        it is replaced with a fast path that skips concatenation when there
        are no extra batches.
        """
        batches = [self.get_data()]
        batches.extend(self.get_extra_batches())
        if len(batches) == 1:
            # No extra batches queued -> return the sampled batch as-is.
            return batches[0]
        return concat_samples(batches)

    @abstractmethod
    def get_data(self) -> SampleBatchType:
        """Called by `self.next()` to return the next batch of data.

        Override this in child classes.

        Returns:
            The next batch of data.
        """
        raise NotImplementedError

    @abstractmethod
    def get_metrics(self) -> List[RolloutMetrics]:
        """Returns list of episode metrics since the last call to this method.

        The list will contain one RolloutMetrics object per completed episode.

        Returns:
            List of RolloutMetrics objects, one per completed episode since
            the last call to this method.
        """
        raise NotImplementedError

    @abstractmethod
    def get_extra_batches(self) -> List[SampleBatchType]:
        """Returns list of extra batches since the last call to this method.

        The list will contain all SampleBatches or
        MultiAgentBatches that the user has provided thus-far. Users can
        add these "extra batches" to an episode by calling the episode's
        `add_extra_batch([SampleBatchType])` method. This can be done from
        inside an overridden `Policy.compute_actions_from_input_dict(...,
        episodes)` or from a custom callback's `on_episode_[start|step|end]()`
        methods.

        Returns:
            List of SamplesBatches or MultiAgentBatches provided thus-far by
            the user since the last call to this method.
        """
        raise NotImplementedError
105
+
106
+
107
@OldAPIStack
class SyncSampler(SamplerInput):
    """Sync SamplerInput that collects experiences when `get_data()` is called."""

    def __init__(
        self,
        *,
        worker: "RolloutWorker",
        env: BaseEnv,
        clip_rewards: Union[bool, float],
        rollout_fragment_length: int,
        count_steps_by: str = "env_steps",
        callbacks: "RLlibCallback",
        multiple_episodes_in_batch: bool = False,
        normalize_actions: bool = True,
        clip_actions: bool = False,
        observation_fn: Optional["ObservationFunction"] = None,
        sample_collector_class: Optional[Type[SampleCollector]] = None,
        render: bool = False,
        # Obsolete.
        policies=None,
        policy_mapping_fn=None,
        preprocessors=None,
        obs_filters=None,
        tf_sess=None,
        horizon=DEPRECATED_VALUE,
        soft_horizon=DEPRECATED_VALUE,
        no_done_at_end=DEPRECATED_VALUE,
    ):
        """Initializes a SyncSampler instance.

        Args:
            worker: The RolloutWorker that will use this Sampler for sampling.
            env: Any Env object. Will be converted into an RLlib BaseEnv.
            clip_rewards: True for +/-1.0 clipping,
                actual float value for +/- value clipping. False for no
                clipping.
            rollout_fragment_length: The length of a fragment to collect
                before building a SampleBatch from the data and resetting
                the SampleBatchBuilder object.
            count_steps_by: One of "env_steps" (default) or "agent_steps".
                Use "agent_steps", if you want rollout lengths to be counted
                by individual agent steps. In a multi-agent env,
                a single env_step contains one or more agent_steps, depending
                on how many agents are present at any given time in the
                ongoing episode.
            callbacks: The RLlibCallback object to use when episode
                events happen during rollout.
            multiple_episodes_in_batch: Whether to pack multiple
                episodes into each batch. This guarantees batches will be
                exactly `rollout_fragment_length` in size.
            normalize_actions: Whether to normalize actions to the
                action space's bounds.
            clip_actions: Whether to clip actions according to the
                given action_space's bounds.
            observation_fn: Optional multi-agent observation func to use for
                preprocessing observations.
            sample_collector_class: An optional SampleCollector sub-class to
                use to collect, store, and retrieve environment-, model-,
                and sampler data.
            render: Whether to try to render the environment after each step.
        """
        # All of the following arguments are deprecated. They will instead be
        # provided via the passed in `worker` arg, e.g. `worker.policy_map`.
        # Only warn once per process for each deprecated kwarg; the horizon
        # family hard-errors.
        if log_once("deprecated_sync_sampler_args"):
            if policies is not None:
                deprecation_warning(old="policies")
            if policy_mapping_fn is not None:
                deprecation_warning(old="policy_mapping_fn")
            if preprocessors is not None:
                deprecation_warning(old="preprocessors")
            if obs_filters is not None:
                deprecation_warning(old="obs_filters")
            if tf_sess is not None:
                deprecation_warning(old="tf_sess")
            if horizon != DEPRECATED_VALUE:
                deprecation_warning(old="horizon", error=True)
            if soft_horizon != DEPRECATED_VALUE:
                deprecation_warning(old="soft_horizon", error=True)
            if no_done_at_end != DEPRECATED_VALUE:
                deprecation_warning(old="no_done_at_end", error=True)

        self.base_env = convert_to_base_env(env)
        self.rollout_fragment_length = rollout_fragment_length
        # Queue of user-provided "extra" batches (see get_extra_batches()).
        self.extra_batches = queue.Queue()
        self.perf_stats = _PerfStats(
            ema_coef=worker.config.sampler_perf_stats_ema_coef,
        )
        if not sample_collector_class:
            sample_collector_class = SimpleListCollector
        self.sample_collector = sample_collector_class(
            worker.policy_map,
            clip_rewards,
            callbacks,
            multiple_episodes_in_batch,
            rollout_fragment_length,
            count_steps_by=count_steps_by,
        )
        self.render = render

        # Keep a reference to the underlying EnvRunnerV2 instance for
        # unit testing purpose.
        self._env_runner_obj = EnvRunnerV2(
            worker=worker,
            base_env=self.base_env,
            multiple_episodes_in_batch=multiple_episodes_in_batch,
            callbacks=callbacks,
            perf_stats=self.perf_stats,
            rollout_fragment_length=rollout_fragment_length,
            count_steps_by=count_steps_by,
            render=self.render,
        )
        # `run()` returns a generator yielding SampleBatches interleaved
        # with RolloutMetrics; `get_data()` demultiplexes them.
        self._env_runner = self._env_runner_obj.run()
        self.metrics_queue = queue.Queue()

    @override(SamplerInput)
    def get_data(self) -> SampleBatchType:
        # Pull items off the env-runner generator until a data batch appears;
        # any RolloutMetrics encountered on the way are queued for
        # get_metrics().
        while True:
            item = next(self._env_runner)
            if isinstance(item, RolloutMetrics):
                self.metrics_queue.put(item)
            else:
                return item

    @override(SamplerInput)
    def get_metrics(self) -> List[RolloutMetrics]:
        # Drain the metrics queue, attaching the current perf stats snapshot
        # to each completed episode's metrics.
        completed = []
        while True:
            try:
                completed.append(
                    self.metrics_queue.get_nowait()._replace(
                        perf_stats=self.perf_stats.get()
                    )
                )
            except queue.Empty:
                break
        return completed

    @override(SamplerInput)
    def get_extra_batches(self) -> List[SampleBatchType]:
        # Drain and return all user-queued extra batches.
        extra = []
        while True:
            try:
                extra.append(self.extra_batches.get_nowait())
            except queue.Empty:
                break
        return extra
.venv/lib/python3.11/site-packages/ray/rllib/evaluation/worker_set.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ from ray.rllib.utils.deprecation import Deprecated
2
+
3
+
4
@Deprecated(
    new="ray.rllib.env.env_runner_group.EnvRunnerGroup",
    help="The class has only be renamed w/o any changes in functionality.",
    error=True,
)
class WorkerSet:
    # Deprecated alias stub: `WorkerSet` was renamed to `EnvRunnerGroup`.
    # With `error=True`, any use of this class raises and points callers at
    # the new location.
    pass
.venv/lib/python3.11/site-packages/ray/rllib/models/torch/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (195 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/models/torch/__pycache__/attention_net.cpython-311.pyc ADDED
Binary file (20.4 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/models/torch/__pycache__/fcnet.cpython-311.pyc ADDED
Binary file (6.78 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/models/torch/__pycache__/mingpt.cpython-311.pyc ADDED
Binary file (16.4 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/models/torch/__pycache__/recurrent_net.cpython-311.pyc ADDED
Binary file (14.2 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/models/torch/__pycache__/torch_action_dist.cpython-311.pyc ADDED
Binary file (45.5 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/models/torch/__pycache__/torch_distributions.cpython-311.pyc ADDED
Binary file (41.5 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/offline/__init__.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ray.rllib.offline.d4rl_reader import D4RLReader
2
+ from ray.rllib.offline.dataset_reader import DatasetReader, get_dataset_and_shards
3
+ from ray.rllib.offline.dataset_writer import DatasetWriter
4
+ from ray.rllib.offline.io_context import IOContext
5
+ from ray.rllib.offline.input_reader import InputReader
6
+ from ray.rllib.offline.mixed_input import MixedInput
7
+ from ray.rllib.offline.json_reader import JsonReader
8
+ from ray.rllib.offline.json_writer import JsonWriter
9
+ from ray.rllib.offline.output_writer import OutputWriter, NoopOutput
10
+ from ray.rllib.offline.resource import get_offline_io_resource_bundles
11
+ from ray.rllib.offline.shuffled_input import ShuffledInput
12
+ from ray.rllib.offline.feature_importance import FeatureImportance
13
+
14
+
15
+ __all__ = [
16
+ "IOContext",
17
+ "JsonReader",
18
+ "JsonWriter",
19
+ "NoopOutput",
20
+ "OutputWriter",
21
+ "InputReader",
22
+ "MixedInput",
23
+ "ShuffledInput",
24
+ "D4RLReader",
25
+ "DatasetReader",
26
+ "DatasetWriter",
27
+ "get_dataset_and_shards",
28
+ "get_offline_io_resource_bundles",
29
+ "FeatureImportance",
30
+ ]
.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/dataset_reader.cpython-311.pyc ADDED
Binary file (14.7 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/dataset_writer.cpython-311.pyc ADDED
Binary file (4.25 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/feature_importance.cpython-311.pyc ADDED
Binary file (14.3 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/io_context.cpython-311.pyc ADDED
Binary file (3.65 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/is_estimator.cpython-311.pyc ADDED
Binary file (789 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/json_reader.cpython-311.pyc ADDED
Binary file (22.9 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/json_writer.cpython-311.pyc ADDED
Binary file (8.15 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/mixed_input.cpython-311.pyc ADDED
Binary file (3.77 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/off_policy_estimator.cpython-311.pyc ADDED
Binary file (587 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/offline_data.cpython-311.pyc ADDED
Binary file (8.28 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/offline_env_runner.cpython-311.pyc ADDED
Binary file (13.4 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/offline_evaluation_utils.cpython-311.pyc ADDED
Binary file (6.62 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/offline_evaluator.cpython-311.pyc ADDED
Binary file (3.6 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/offline_prelearner.cpython-311.pyc ADDED
Binary file (23.4 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/output_writer.cpython-311.pyc ADDED
Binary file (1.59 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/wis_estimator.cpython-311.pyc ADDED
Binary file (848 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/offline/d4rl_reader.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import gymnasium as gym
3
+
4
+ from ray.rllib.offline.input_reader import InputReader
5
+ from ray.rllib.offline.io_context import IOContext
6
+ from ray.rllib.policy.sample_batch import SampleBatch
7
+ from ray.rllib.utils.annotations import override, PublicAPI
8
+ from ray.rllib.utils.typing import SampleBatchType
9
+ from typing import Dict
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
@PublicAPI
class D4RLReader(InputReader):
    """Reader object that loads the dataset from the D4RL dataset."""

    @PublicAPI
    def __init__(self, inputs: str, ioctx: IOContext = None):
        """Initializes a D4RLReader instance.

        Args:
            inputs: String corresponding to the D4RL environment name
                (passed straight to `gym.make`). Requires the `d4rl`
                package to be installed.
            ioctx: Current IO context object.
        """
        # Lazy import: d4rl is an optional dependency, only needed when
        # this reader is actually instantiated.
        import d4rl

        self.env = gym.make(inputs)
        self.dataset = _convert_to_batch(d4rl.qlearning_dataset(self.env))
        assert self.dataset.count >= 1
        # Index of the next dataset row to emit.
        self.counter = 0

    @override(InputReader)
    def next(self) -> SampleBatchType:
        """Returns the next single-row slice of the dataset, cycling forever.

        Bug fix vs. the original implementation: the counter was incremented
        *before* slicing, so row 0 was never returned and one out-of-range
        slice `(count, count + 1)` was emitted per pass over the data. We now
        slice at the current index, then advance.
        """
        # Wrap around once every row has been emitted.
        if self.counter >= self.dataset.count:
            self.counter = 0

        idx = self.counter
        self.counter += 1
        # Exactly one row: the half-open range [idx, idx + 1).
        return self.dataset.slice(start=idx, end=idx + 1)
40
+
41
+
42
def _convert_to_batch(dataset: Dict) -> SampleBatchType:
    """Converts a D4RL q-learning dataset dict into an RLlib SampleBatch.

    Args:
        dataset: Dict as returned by `d4rl.qlearning_dataset`, with the keys
            "observations", "actions", "next_observations", "rewards", and
            "terminals".

    Returns:
        A SampleBatch holding the same arrays under RLlib's column names.
    """
    # Map RLlib column names -> D4RL dataset keys, then copy them over in one
    # comprehension.
    column_to_key = {
        SampleBatch.OBS: "observations",
        SampleBatch.ACTIONS: "actions",
        SampleBatch.NEXT_OBS: "next_observations",
        SampleBatch.REWARDS: "rewards",
        SampleBatch.TERMINATEDS: "terminals",
    }
    return SampleBatch({col: dataset[key] for col, key in column_to_key.items()})
.venv/lib/python3.11/site-packages/ray/rllib/offline/dataset_reader.py ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import math
3
+ from pathlib import Path
4
+ import re
5
+ import numpy as np
6
+ from typing import List, Tuple, TYPE_CHECKING, Optional
7
+ import zipfile
8
+
9
+ import ray.data
10
+ from ray.rllib.offline.input_reader import InputReader
11
+ from ray.rllib.offline.io_context import IOContext
12
+ from ray.rllib.offline.json_reader import from_json_data, postprocess_actions
13
+ from ray.rllib.policy.sample_batch import concat_samples, SampleBatch, DEFAULT_POLICY_ID
14
+ from ray.rllib.utils.annotations import override, PublicAPI
15
+ from ray.rllib.utils.typing import SampleBatchType
16
+
17
+ if TYPE_CHECKING:
18
+ from ray.rllib.algorithms.algorithm_config import AlgorithmConfig
19
+
20
+ DEFAULT_NUM_CPUS_PER_TASK = 0.5
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
+ def _unzip_this_path(fpath: Path, extract_path: str):
26
+ with zipfile.ZipFile(str(fpath), "r") as zip_ref:
27
+ zip_ref.extractall(extract_path)
28
+
29
+
30
+ def _unzip_if_needed(paths: List[str], format: str):
31
+ """If a path in paths is a zip file, unzip it and use path of the unzipped file"""
32
+ ret_paths = []
33
+ for path in paths:
34
+ if re.search("\\.zip$", str(path)):
35
+ # TODO: We need to add unzip support for s3
36
+ if str(path).startswith("s3://"):
37
+ raise ValueError(
38
+ "unzip_if_needed currently does not support remote paths from s3"
39
+ )
40
+ extract_path = "./"
41
+ try:
42
+ _unzip_this_path(str(path), extract_path)
43
+ except FileNotFoundError:
44
+ # intrepreted as a relative path to rllib folder
45
+ try:
46
+ # TODO: remove this later when we replace all tests with s3 paths
47
+ _unzip_this_path(Path(__file__).parent.parent / path, extract_path)
48
+ except FileNotFoundError:
49
+ raise FileNotFoundError(f"File not found: {path}")
50
+
51
+ unzipped_path = str(
52
+ Path(extract_path).absolute() / f"{Path(path).stem}.{format}"
53
+ )
54
+ ret_paths.append(unzipped_path)
55
+ else:
56
+ # TODO: We can get rid of this logic when we replace all tests with s3 paths
57
+ if str(path).startswith("s3://"):
58
+ ret_paths.append(path)
59
+ else:
60
+ if not Path(path).exists():
61
+ relative_path = str(Path(__file__).parent.parent / path)
62
+ if not Path(relative_path).exists():
63
+ raise FileNotFoundError(f"File not found: {path}")
64
+ path = relative_path
65
+ ret_paths.append(path)
66
+ return ret_paths
67
+
68
+
69
@PublicAPI
def get_dataset_and_shards(
    config: "AlgorithmConfig", num_workers: int = 0
) -> Tuple[ray.data.Dataset, List[ray.data.Dataset]]:
    """Returns a dataset and a list of shards.

    This function uses algorithm configs to create a dataset and a list of shards.
    The following config keys are used to create the dataset:
        input: The input type should be "dataset".
        input_config: A dict containing the following key and values:
            `format`: str, specifies the format of the input data. This will be the
            format that ray dataset supports. See ray.data.Dataset for
            supported formats. Only "parquet" or "json" are supported for now.
            `paths`: str, a single string or a list of strings. Each string is a path
            to a file or a directory holding the dataset. It can be either a local path
            or a remote path (e.g. to an s3 bucket).
            `loader_fn`: Callable[None, ray.data.Dataset], Instead of
            specifying paths and format, you can specify a function to load the dataset.
            `parallelism`: int, The number of tasks to use for loading the dataset.
            If not specified, it will be set to the number of workers.
            `num_cpus_per_read_task`: float, The number of CPUs to use for each read
            task. If not specified, it will be set to 0.5.

    Args:
        config: The config dict for the algorithm.
        num_workers: The number of shards to create for remote workers.

    Returns:
        dataset: The dataset object.
        shards: A list of dataset shards. For num_workers > 0 the first returned
            shard would be a dummy None shard for local_worker.

    Raises:
        ValueError: If the input_config is inconsistent (unsupported format,
            both/neither of `loader_fn` and `format`+`paths` given, or `paths`
            of the wrong type).
    """
    # check input and input config keys
    assert config.input_ == "dataset", (
        f"Must specify config.input_ as 'dataset' if"
        f" calling `get_dataset_and_shards`. Got {config.input_}"
    )

    # check input config format
    input_config = config.input_config
    format = input_config.get("format")

    # `format` may legitimately be None when a `loader_fn` is used instead.
    supported_fmts = ["json", "parquet"]
    if format is not None and format not in supported_fmts:
        raise ValueError(
            f"Unsupported format {format}. Supported formats are {supported_fmts}"
        )

    # check paths and loader_fn since only one of them is required.
    paths = input_config.get("paths")
    loader_fn = input_config.get("loader_fn")
    if loader_fn and (format or paths):
        raise ValueError(
            "When using a `loader_fn`, you cannot specify a `format` or `path`."
        )

    # check if at least loader_fn or format + path is specified.
    if not (format and paths) and not loader_fn:
        raise ValueError(
            "Must specify either a `loader_fn` or a `format` and `path` in "
            "`input_config`."
        )

    # check paths to be a str or list[str] if not None
    if paths is not None:
        if isinstance(paths, str):
            # Normalize a single path string to a one-element list.
            paths = [paths]
        elif isinstance(paths, list):
            # NOTE(review): only the first element is type-checked here.
            assert isinstance(paths[0], str), "Paths must be a list of path strings."
        else:
            raise ValueError("Paths must be a path string or a list of path strings.")
        # Transparently extract any local .zip archives among the paths.
        paths = _unzip_if_needed(paths, format)

    # TODO (Kourosh): num_workers is not necessary since we can use parallelism for
    # everything. Having two parameters is confusing here. Remove num_workers later.
    parallelism = input_config.get("parallelism", num_workers or 1)
    cpus_per_task = input_config.get(
        "num_cpus_per_read_task", DEFAULT_NUM_CPUS_PER_TASK
    )

    if loader_fn:
        # User-supplied loader takes precedence over format/paths reading.
        dataset = loader_fn()
    elif format == "json":
        dataset = ray.data.read_json(
            paths, parallelism=parallelism, ray_remote_args={"num_cpus": cpus_per_task}
        )
    elif format == "parquet":
        dataset = ray.data.read_parquet(
            paths, parallelism=parallelism, ray_remote_args={"num_cpus": cpus_per_task}
        )
    else:
        # Unreachable given the validation above, kept as a safety net.
        raise ValueError("Un-supported Ray dataset format: ", format)

    # Local worker will be responsible for sampling.
    if num_workers == 0:
        # Dataset is the only shard we need.
        return dataset, [dataset]
    # Remote workers are responsible for sampling:
    else:
        # Each remote worker gets 1 shard.
        remote_shards = dataset.repartition(
            num_blocks=num_workers, shuffle=False
        ).split(num_workers)

        # The first None shard is for the local worker, which
        # shouldn't be doing rollout work anyways.
        return dataset, [None] + remote_shards
176
+
177
+
178
@PublicAPI
class DatasetReader(InputReader):
    """Reader object that loads data from Ray Dataset.

    Examples:
        config = {
            "input": "dataset",
            "input_config": {
                "format": "json",
                # A single data file, a directory, or anything
                # that ray.data.dataset recognizes.
                "paths": "/tmp/sample_batches/",
                # By default, parallelism=num_workers.
                "parallelism": 3,
                # Dataset allocates 0.5 CPU for each reader by default.
                # Adjust this value based on the size of your offline dataset.
                "num_cpus_per_read_task": 0.5,
            }
        }
    """

    @PublicAPI
    def __init__(self, ds: ray.data.Dataset, ioctx: Optional[IOContext] = None):
        """Initializes a DatasetReader instance.

        Args:
            ds: Ray dataset to sample from. May be None/empty, in which case a
                non-functioning reader is created (see comment below).
            ioctx: Current IO context object; a default-constructed IOContext
                is used if not given.
        """
        self._ioctx = ioctx or IOContext()
        # Initialize BOTH the private and the public policy-map attribute so
        # each always exists. The original code only set `self.policy_map`
        # here but assigned `self._policy_map` in the worker branch below,
        # leaving one of the two undefined depending on the code path.
        self._default_policy = self._policy_map = self.policy_map = None
        self.preprocessor = None
        self._dataset = ds
        # Total number of rows, or None for the non-functioning reader.
        self.count = None if not self._dataset else self._dataset.count()
        # do this to disable the ray data stdout logging
        ray.data.DataContext.get_current().enable_progress_bars = False

        # the number of steps to return per call to next()
        self.batch_size = self._ioctx.config.get("train_batch_size", 1)
        num_workers = self._ioctx.config.get("num_env_runners", 0)
        seed = self._ioctx.config.get("seed", None)
        if num_workers:
            # Split the configured train batch size evenly across workers.
            self.batch_size = max(math.ceil(self.batch_size / num_workers), 1)
        # We allow the creation of a non-functioning None DatasetReader.
        # It's useful for example for a non-rollout local worker.
        if ds:
            if self._ioctx.worker is not None:
                # Keep private and public attribute in sync.
                self._policy_map = self.policy_map = self._ioctx.worker.policy_map
                self._default_policy = self._policy_map.get(DEFAULT_POLICY_ID)
                self.preprocessor = (
                    self._ioctx.worker.preprocessors.get(DEFAULT_POLICY_ID)
                    if not self._ioctx.config.get("_disable_preprocessors", False)
                    else None
                )
            # Use the module logger (lazy %-formatting) instead of a bare
            # print(), so users can control verbosity via logging config.
            logger.info(
                "DatasetReader %s has %s samples.",
                self._ioctx.worker_index,
                ds.count(),
            )

            def iterator():
                # Infinite row iterator: reshuffle the dataset every epoch.
                while True:
                    ds = self._dataset.random_shuffle(seed=seed)
                    yield from ds.iter_rows()

            self._iter = iterator()
        else:
            self._iter = None

    @override(InputReader)
    def next(self) -> SampleBatchType:
        """Returns a batch containing at least `self.batch_size` timesteps.

        Pulls rows off the infinite shuffled iterator, decodes, preprocesses
        and postprocesses each one, and concatenates them until the batch
        size is reached or exceeded.
        """
        # next() should not get called on None DatasetReader.
        assert self._iter is not None
        ret = []
        count = 0
        while count < self.batch_size:
            d = next(self._iter)
            # Columns like obs are compressed when written by DatasetWriter.
            d = from_json_data(d, self._ioctx.worker)
            count += d.count
            d = self._preprocess_if_needed(d)
            d = postprocess_actions(d, self._ioctx)
            d = self._postprocess_if_needed(d)
            ret.append(d)
        ret = concat_samples(ret)
        return ret

    def _preprocess_if_needed(self, batch: SampleBatchType) -> SampleBatchType:
        """Applies the policy's observation preprocessor to obs columns, if set."""
        # TODO: @kourosh, preprocessor is only supported for single agent case.
        if self.preprocessor:
            for key in (SampleBatch.CUR_OBS, SampleBatch.NEXT_OBS):
                if key in batch:
                    batch[key] = np.stack(
                        [self.preprocessor.transform(s) for s in batch[key]]
                    )
        return batch

    def _postprocess_if_needed(self, batch: SampleBatchType) -> SampleBatchType:
        """Runs the default policy's trajectory postprocessing per episode.

        Only active when `postprocess_inputs` is set in the config; multi-agent
        batches are not supported.
        """
        if not self._ioctx.config.get("postprocess_inputs"):
            return batch

        if isinstance(batch, SampleBatch):
            out = []
            for sub_batch in batch.split_by_episode():
                if self._default_policy is not None:
                    out.append(self._default_policy.postprocess_trajectory(sub_batch))
                else:
                    out.append(sub_batch)
            return concat_samples(out)
        else:
            # TODO(ekl) this is trickier since the alignments between agent
            # trajectories in the episode are not available any more.
            raise NotImplementedError(
                "Postprocessing of multi-agent data not implemented yet."
            )
.venv/lib/python3.11/site-packages/ray/rllib/offline/dataset_writer.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import time
4
+
5
+ from ray import data
6
+ from ray.rllib.offline.io_context import IOContext
7
+ from ray.rllib.offline.json_writer import _to_json_dict
8
+ from ray.rllib.offline.output_writer import OutputWriter
9
+ from ray.rllib.utils.annotations import override, PublicAPI
10
+ from ray.rllib.utils.typing import SampleBatchType
11
+ from typing import Dict, List
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
@PublicAPI
class DatasetWriter(OutputWriter):
    """Writer object that saves experiences using Datasets."""

    @PublicAPI
    def __init__(
        self,
        ioctx: IOContext = None,
        compress_columns: List[str] = frozenset(["obs", "new_obs"]),
    ):
        """Initializes a DatasetWriter instance.

        Examples:
            config = {
                "output": "dataset",
                "output_config": {
                    "format": "json",
                    "path": "/tmp/test_samples/",
                    "max_num_samples_per_file": 100000,
                }
            }

        Args:
            ioctx: current IO context object.
            compress_columns: list of sample batch columns to compress.
        """
        self.ioctx = ioctx or IOContext()

        # Read the output config from `self.ioctx` (never None), not from the
        # raw `ioctx` argument: the original code dereferenced `ioctx` here
        # and crashed with an AttributeError whenever no IOContext was passed,
        # defeating the `ioctx or IOContext()` default above.
        output_config: Dict = self.ioctx.output_config
        assert (
            "format" in output_config
        ), "output_config.format must be specified when using Dataset output."
        assert (
            "path" in output_config
        ), "output_config.path must be specified when using Dataset output."

        self.format = output_config["format"]
        self.path = os.path.abspath(os.path.expanduser(output_config["path"]))
        # Flush the in-memory buffer to disk once it reaches this many batches.
        self.max_num_samples_per_file = output_config.get(
            "max_num_samples_per_file", 100000
        )
        self.compress_columns = compress_columns

        # Buffer of JSON-ready sample dicts awaiting a flush.
        self.samples = []

    @override(OutputWriter)
    def write(self, sample_batch: SampleBatchType):
        """Buffers one sample batch; writes all buffered batches once full.

        Args:
            sample_batch: The batch of experiences to buffer/write.

        Raises:
            ValueError: If `self.format` is neither "json" nor "parquet".
        """
        start = time.time()

        # Make sure columns like obs are compressed and writable.
        d = _to_json_dict(sample_batch, self.compress_columns)
        self.samples.append(d)

        # Todo: We should flush at the end of sampling even if this
        # condition was not reached.
        if len(self.samples) >= self.max_num_samples_per_file:
            ds = data.from_items(self.samples).repartition(num_blocks=1, shuffle=False)
            if self.format == "json":
                ds.write_json(self.path, try_create_dir=True)
            elif self.format == "parquet":
                ds.write_parquet(self.path, try_create_dir=True)
            else:
                raise ValueError("Unknown output type: ", self.format)
            self.samples = []
        logger.debug("Wrote dataset in {}s".format(time.time() - start))
.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (802 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__pycache__/direct_method.cpython-311.pyc ADDED
Binary file (8.83 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__pycache__/fqe_torch_model.cpython-311.pyc ADDED
Binary file (15.9 kB). View file