koichi12 commited on
Commit
6b42d14
·
verified ·
1 Parent(s): 747c195

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/__init__.cpython-311.pyc +0 -0
  2. .venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/actor_manager.cpython-311.pyc +0 -0
  3. .venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/actors.cpython-311.pyc +0 -0
  4. .venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/annotations.cpython-311.pyc +0 -0
  5. .venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/error.cpython-311.pyc +0 -0
  6. .venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/filter.cpython-311.pyc +0 -0
  7. .venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/filter_manager.cpython-311.pyc +0 -0
  8. .venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/framework.cpython-311.pyc +0 -0
  9. .venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/images.cpython-311.pyc +0 -0
  10. .venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/minibatch_utils.cpython-311.pyc +0 -0
  11. .venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/numpy.cpython-311.pyc +0 -0
  12. .venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/policy.cpython-311.pyc +0 -0
  13. .venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/sgd.cpython-311.pyc +0 -0
  14. .venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/tensor_dtype.cpython-311.pyc +0 -0
  15. .venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/test_utils.cpython-311.pyc +0 -0
  16. .venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/tf_run_builder.cpython-311.pyc +0 -0
  17. .venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/tf_utils.cpython-311.pyc +0 -0
  18. .venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/threading.cpython-311.pyc +0 -0
  19. .venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/typing.cpython-311.pyc +0 -0
  20. .venv/lib/python3.11/site-packages/ray/rllib/utils/metrics/__init__.py +158 -0
  21. .venv/lib/python3.11/site-packages/ray/rllib/utils/metrics/__pycache__/__init__.cpython-311.pyc +0 -0
  22. .venv/lib/python3.11/site-packages/ray/rllib/utils/metrics/__pycache__/learner_info.cpython-311.pyc +0 -0
  23. .venv/lib/python3.11/site-packages/ray/rllib/utils/metrics/__pycache__/metrics_logger.cpython-311.pyc +0 -0
  24. .venv/lib/python3.11/site-packages/ray/rllib/utils/metrics/__pycache__/stats.cpython-311.pyc +0 -0
  25. .venv/lib/python3.11/site-packages/ray/rllib/utils/metrics/__pycache__/window_stat.cpython-311.pyc +0 -0
  26. .venv/lib/python3.11/site-packages/ray/rllib/utils/metrics/learner_info.py +120 -0
  27. .venv/lib/python3.11/site-packages/ray/rllib/utils/metrics/metrics_logger.py +1186 -0
  28. .venv/lib/python3.11/site-packages/ray/rllib/utils/metrics/stats.py +757 -0
  29. .venv/lib/python3.11/site-packages/ray/rllib/utils/metrics/window_stat.py +79 -0
  30. .venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/__init__.py +44 -0
  31. .venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/__pycache__/__init__.cpython-311.pyc +0 -0
  32. .venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/__pycache__/base.cpython-311.pyc +0 -0
  33. .venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/__pycache__/episode_replay_buffer.cpython-311.pyc +0 -0
  34. .venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/__pycache__/fifo_replay_buffer.cpython-311.pyc +0 -0
  35. .venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/__pycache__/multi_agent_episode_buffer.cpython-311.pyc +0 -0
  36. .venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/__pycache__/multi_agent_mixin_replay_buffer.cpython-311.pyc +0 -0
  37. .venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/__pycache__/multi_agent_prioritized_episode_buffer.cpython-311.pyc +0 -0
  38. .venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/__pycache__/multi_agent_prioritized_replay_buffer.cpython-311.pyc +0 -0
  39. .venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/__pycache__/multi_agent_replay_buffer.cpython-311.pyc +0 -0
  40. .venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/__pycache__/prioritized_episode_buffer.cpython-311.pyc +0 -0
  41. .venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/__pycache__/prioritized_replay_buffer.cpython-311.pyc +0 -0
  42. .venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/__pycache__/replay_buffer.cpython-311.pyc +0 -0
  43. .venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/__pycache__/reservoir_replay_buffer.cpython-311.pyc +0 -0
  44. .venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/__pycache__/simple_replay_buffer.cpython-311.pyc +0 -0
  45. .venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/__pycache__/utils.cpython-311.pyc +0 -0
  46. .venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/base.py +76 -0
  47. .venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/episode_replay_buffer.py +1098 -0
  48. .venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/fifo_replay_buffer.py +109 -0
  49. .venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/multi_agent_episode_buffer.py +1026 -0
  50. .venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/multi_agent_mixin_replay_buffer.py +404 -0
.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (4.64 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/actor_manager.cpython-311.pyc ADDED
Binary file (43.3 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/actors.cpython-311.pyc ADDED
Binary file (12.5 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/annotations.cpython-311.pyc ADDED
Binary file (8.13 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/error.cpython-311.pyc ADDED
Binary file (5.81 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/filter.cpython-311.pyc ADDED
Binary file (20.5 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/filter_manager.cpython-311.pyc ADDED
Binary file (4.39 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/framework.cpython-311.pyc ADDED
Binary file (16.3 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/images.cpython-311.pyc ADDED
Binary file (2.87 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/minibatch_utils.cpython-311.pyc ADDED
Binary file (16.6 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/numpy.cpython-311.pyc ADDED
Binary file (27.9 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/policy.cpython-311.pyc ADDED
Binary file (14.7 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/sgd.cpython-311.pyc ADDED
Binary file (5.36 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/tensor_dtype.cpython-311.pyc ADDED
Binary file (2.98 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/test_utils.cpython-311.pyc ADDED
Binary file (73.8 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/tf_run_builder.cpython-311.pyc ADDED
Binary file (6.6 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/tf_utils.cpython-311.pyc ADDED
Binary file (36.5 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/threading.cpython-311.pyc ADDED
Binary file (1.76 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/typing.cpython-311.pyc ADDED
Binary file (8.39 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/metrics/__init__.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ray.rllib.core import ALL_MODULES # noqa
2
+
3
+
4
# Algorithm ResultDict keys.
# Top-level keys under which RLlib components report their results inside an
# Algorithm's result dict.
AGGREGATOR_ACTOR_RESULTS = "aggregator_actors"
EVALUATION_RESULTS = "evaluation"
ENV_RUNNER_RESULTS = "env_runners"
REPLAY_BUFFER_RESULTS = "replay_buffer"
LEARNER_GROUP = "learner_group"
LEARNER_RESULTS = "learners"
FAULT_TOLERANCE_STATS = "fault_tolerance"
TIMERS = "timers"

# RLModule metrics.
NUM_TRAINABLE_PARAMETERS = "num_trainable_parameters"
NUM_NON_TRAINABLE_PARAMETERS = "num_non_trainable_parameters"

# Number of times `training_step()` was called in one iteration.
NUM_TRAINING_STEP_CALLS_PER_ITERATION = "num_training_step_calls_per_iteration"

# Counters for sampling, sampling (on eval workers) and
# training steps (env- and agent steps).
MEAN_NUM_EPISODE_LISTS_RECEIVED = "mean_num_episode_lists_received"
NUM_AGENT_STEPS_SAMPLED = "num_agent_steps_sampled"
NUM_AGENT_STEPS_SAMPLED_LIFETIME = "num_agent_steps_sampled_lifetime"
NUM_AGENT_STEPS_SAMPLED_THIS_ITER = "num_agent_steps_sampled_this_iter"  # @OldAPIStack
NUM_ENV_STEPS_SAMPLED = "num_env_steps_sampled"
NUM_ENV_STEPS_SAMPLED_LIFETIME = "num_env_steps_sampled_lifetime"
NUM_ENV_STEPS_SAMPLED_PER_SECOND = "num_env_steps_sampled_per_second"
NUM_ENV_STEPS_SAMPLED_THIS_ITER = "num_env_steps_sampled_this_iter"  # @OldAPIStack
NUM_ENV_STEPS_SAMPLED_FOR_EVALUATION_THIS_ITER = (
    "num_env_steps_sampled_for_evaluation_this_iter"
)
NUM_MODULE_STEPS_SAMPLED = "num_module_steps_sampled"
NUM_MODULE_STEPS_SAMPLED_LIFETIME = "num_module_steps_sampled_lifetime"
ENV_TO_MODULE_SUM_EPISODES_LENGTH_IN = "env_to_module_sum_episodes_length_in"
ENV_TO_MODULE_SUM_EPISODES_LENGTH_OUT = "env_to_module_sum_episodes_length_out"

# Counters for adding and evicting in replay buffers.
ACTUAL_N_STEP = "actual_n_step"
AGENT_ACTUAL_N_STEP = "agent_actual_n_step"
AGENT_STEP_UTILIZATION = "agent_step_utilization"
ENV_STEP_UTILIZATION = "env_step_utilization"
NUM_AGENT_EPISODES_STORED = "num_agent_episodes"
NUM_AGENT_EPISODES_ADDED = "num_agent_episodes_added"
NUM_AGENT_EPISODES_ADDED_LIFETIME = "num_agent_episodes_added_lifetime"
NUM_AGENT_EPISODES_EVICTED = "num_agent_episodes_evicted"
NUM_AGENT_EPISODES_EVICTED_LIFETIME = "num_agent_episodes_evicted_lifetime"
NUM_AGENT_EPISODES_PER_SAMPLE = "num_agent_episodes_per_sample"
NUM_AGENT_RESAMPLES = "num_agent_resamples"
NUM_AGENT_STEPS_ADDED = "num_agent_steps_added"
NUM_AGENT_STEPS_ADDED_LIFETIME = "num_agent_steps_added_lifetime"
NUM_AGENT_STEPS_EVICTED = "num_agent_steps_evicted"
NUM_AGENT_STEPS_EVICTED_LIFETIME = "num_agent_steps_evicted_lifetime"
NUM_AGENT_STEPS_PER_SAMPLE = "num_agent_steps_per_sample"
NUM_AGENT_STEPS_PER_SAMPLE_LIFETIME = "num_agent_steps_per_sample_lifetime"
NUM_AGENT_STEPS_STORED = "num_agent_steps"
NUM_ENV_STEPS_STORED = "num_env_steps"
NUM_ENV_STEPS_ADDED = "num_env_steps_added"
NUM_ENV_STEPS_ADDED_LIFETIME = "num_env_steps_added_lifetime"
NUM_ENV_STEPS_EVICTED = "num_env_steps_evicted"
NUM_ENV_STEPS_EVICTED_LIFETIME = "num_env_steps_evicted_lifetime"
NUM_ENV_STEPS_PER_SAMPLE = "num_env_steps_per_sample"
NUM_ENV_STEPS_PER_SAMPLE_LIFETIME = "num_env_steps_per_sample_lifetime"
# NOTE: `NUM_EPISODES_STORED` maps to the same string key ("num_episodes") as
# `NUM_EPISODES` further below; both names resolve to the same metric entry.
NUM_EPISODES_STORED = "num_episodes"
NUM_EPISODES_ADDED = "num_episodes_added"
NUM_EPISODES_ADDED_LIFETIME = "num_episodes_added_lifetime"
NUM_EPISODES_EVICTED = "num_episodes_evicted"
NUM_EPISODES_EVICTED_LIFETIME = "num_episodes_evicted_lifetime"
NUM_EPISODES_PER_SAMPLE = "num_episodes_per_sample"
NUM_RESAMPLES = "num_resamples"

# Per-episode statistics keys (lengths, returns, durations).
EPISODE_DURATION_SEC_MEAN = "episode_duration_sec_mean"
EPISODE_LEN_MEAN = "episode_len_mean"
EPISODE_LEN_MAX = "episode_len_max"
EPISODE_LEN_MIN = "episode_len_min"
EPISODE_RETURN_MEAN = "episode_return_mean"
EPISODE_RETURN_MAX = "episode_return_max"
EPISODE_RETURN_MIN = "episode_return_min"
NUM_EPISODES = "num_episodes"
NUM_EPISODES_LIFETIME = "num_episodes_lifetime"
TIME_BETWEEN_SAMPLING = "time_between_sampling"

# Learner-group / training step counters.
MEAN_NUM_LEARNER_GROUP_UPDATE_CALLED = "mean_num_learner_group_update_called"
MEAN_NUM_LEARNER_GROUP_RESULTS_RECEIVED = "mean_num_learner_group_results_received"
NUM_AGENT_STEPS_TRAINED = "num_agent_steps_trained"
NUM_AGENT_STEPS_TRAINED_LIFETIME = "num_agent_steps_trained_lifetime"
NUM_AGENT_STEPS_TRAINED_THIS_ITER = "num_agent_steps_trained_this_iter"  # @OldAPIStack
NUM_ENV_STEPS_TRAINED = "num_env_steps_trained"
NUM_ENV_STEPS_TRAINED_LIFETIME = "num_env_steps_trained_lifetime"
NUM_ENV_STEPS_TRAINED_THIS_ITER = "num_env_steps_trained_this_iter"  # @OldAPIStack
NUM_MODULE_STEPS_TRAINED = "num_module_steps_trained"
NUM_MODULE_STEPS_TRAINED_LIFETIME = "num_module_steps_trained_lifetime"
MODULE_TRAIN_BATCH_SIZE_MEAN = "module_train_batch_size_mean"
LEARNER_CONNECTOR_SUM_EPISODES_LENGTH_IN = "learner_connector_sum_episodes_length_in"
LEARNER_CONNECTOR_SUM_EPISODES_LENGTH_OUT = "learner_connector_sum_episodes_length_out"

# Backward compatibility: Replace with num_env_steps_... or num_agent_steps_...
STEPS_TRAINED_THIS_ITER_COUNTER = "num_steps_trained_this_iter"

# Counters for keeping track of worker weight updates (synchronization
# between local worker and remote workers).
NUM_SYNCH_WORKER_WEIGHTS = "num_weight_broadcasts"
NUM_TRAINING_STEP_CALLS_SINCE_LAST_SYNCH_WORKER_WEIGHTS = (
    "num_training_step_calls_since_last_synch_worker_weights"
)
# The running sequence number for a set of NN weights. If a worker's NN has a
# lower sequence number than some weights coming in for an update, the worker
# should perform the update, otherwise ignore the incoming weights (they are older
# or the same) as/than the ones it already has.
WEIGHTS_SEQ_NO = "weights_seq_no"
# Number of total gradient updates that have been performed on a policy.
NUM_GRAD_UPDATES_LIFETIME = "num_grad_updates_lifetime"
# Average difference between the number of grad-updates that the policy/ies had
# that collected the training batch vs the policy that was just updated (trained).
# Good measure for the off-policy'ness of training. Should be 0.0 for PPO and PG,
# small for IMPALA and APPO, and any (larger) value for DQN and other off-policy algos.
DIFF_NUM_GRAD_UPDATES_VS_SAMPLER_POLICY = "diff_num_grad_updates_vs_sampler_policy"

# Counters to track target network updates.
LAST_TARGET_UPDATE_TS = "last_target_update_ts"
NUM_TARGET_UPDATES = "num_target_updates"

# Performance timers
# ------------------
# Duration of n `Algorithm.training_step()` calls making up one "iteration".
# Note that n may be >1 if the user has set up a min time (sec) or timesteps per
# iteration.
TRAINING_ITERATION_TIMER = "training_iteration"
# Duration of a `Algorithm.evaluate()` call.
EVALUATION_ITERATION_TIMER = "evaluation_iteration"
# Duration of a single `training_step()` call.
TRAINING_STEP_TIMER = "training_step"
APPLY_GRADS_TIMER = "apply_grad"
COMPUTE_GRADS_TIMER = "compute_grads"
GARBAGE_COLLECTION_TIMER = "garbage_collection"
RESTORE_ENV_RUNNERS_TIMER = "restore_env_runners"
RESTORE_EVAL_ENV_RUNNERS_TIMER = "restore_eval_env_runners"
SYNCH_WORKER_WEIGHTS_TIMER = "synch_weights"
SYNCH_ENV_CONNECTOR_STATES_TIMER = "synch_env_connectors"
SYNCH_EVAL_ENV_CONNECTOR_STATES_TIMER = "synch_eval_env_connectors"
GRAD_WAIT_TIMER = "grad_wait"
SAMPLE_TIMER = "sample"  # @OldAPIStack
ENV_RUNNER_SAMPLING_TIMER = "env_runner_sampling_timer"
OFFLINE_SAMPLING_TIMER = "offline_sampling_timer"
REPLAY_BUFFER_ADD_DATA_TIMER = "replay_buffer_add_data_timer"
REPLAY_BUFFER_SAMPLE_TIMER = "replay_buffer_sampling_timer"
REPLAY_BUFFER_UPDATE_PRIOS_TIMER = "replay_buffer_update_prios_timer"
LEARNER_UPDATE_TIMER = "learner_update_timer"
LEARN_ON_BATCH_TIMER = "learn"  # @OldAPIStack
LOAD_BATCH_TIMER = "load"
TARGET_NET_UPDATE_TIMER = "target_net_update"
CONNECTOR_TIMERS = "connectors"

# Learner.
LEARNER_STATS_KEY = "learner_stats"
TD_ERROR_KEY = "td_error"
.venv/lib/python3.11/site-packages/ray/rllib/utils/metrics/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (7.2 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/metrics/__pycache__/learner_info.cpython-311.pyc ADDED
Binary file (5.87 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/metrics/__pycache__/metrics_logger.cpython-311.pyc ADDED
Binary file (59.3 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/metrics/__pycache__/stats.cpython-311.pyc ADDED
Binary file (34.4 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/metrics/__pycache__/window_stat.cpython-311.pyc ADDED
Binary file (3.93 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/metrics/learner_info.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import defaultdict
2
+ import numpy as np
3
+ import tree # pip install dm_tree
4
+ from typing import Dict
5
+
6
+ from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID
7
+ from ray.rllib.utils.annotations import OldAPIStack
8
+ from ray.rllib.utils.typing import PolicyID
9
+
10
+ # Instant metrics (keys for metrics.info).
11
+ LEARNER_INFO = "learner"
12
+ # By convention, metrics from optimizing the loss can be reported in the
13
+ # `grad_info` dict returned by learn_on_batch() / compute_grads() via this key.
14
+ LEARNER_STATS_KEY = "learner_stats"
15
+
16
+
17
@OldAPIStack
class LearnerInfoBuilder:
    """Builds a per-policy "learner info" dict from learn-on-batch results.

    Collects the (possibly multi-tower) result dicts returned by
    `Policy.learn_on_batch()` / `Policy.learn_on_loaded_batch()` calls and
    reduces them into a single info dict (one entry per policy ID) via
    `finalize()`.
    """

    def __init__(self, num_devices: int = 1):
        """Initializes a LearnerInfoBuilder instance.

        Args:
            num_devices: Number of devices (towers). Result dicts containing
                "tower_0", ..., "tower_{num_devices - 1}" sub-dicts are reduced
                across these towers.
        """
        self.num_devices = num_devices
        # Maps policy ID -> list of (already tower-reduced) result dicts,
        # one per `add_learn_on_batch_results()` call.
        self.results_all_towers = defaultdict(list)
        # Once finalized, no further results may be added.
        self.is_finalized = False

    def add_learn_on_batch_results(
        self,
        results: Dict,
        policy_id: PolicyID = DEFAULT_POLICY_ID,
    ) -> None:
        """Adds a policy.learn_on_(loaded)?_batch() result to this builder.

        Args:
            results: The results returned by Policy.learn_on_batch or
                Policy.learn_on_loaded_batch.
            policy_id: The policy's ID, whose learn_on_(loaded)_batch method
                returned `results`.
        """
        assert (
            not self.is_finalized
        ), "LearnerInfo already finalized! Cannot add more results."

        # No towers: Single CPU.
        if "tower_0" not in results:
            self.results_all_towers[policy_id].append(results)
        # Multi-GPU case:
        else:
            # Reduce the per-tower sub-dicts across towers via
            # `_all_tower_reduce`. NOTE: The generator `pop`s each
            # "tower_{i}" sub-dict OUT of `results`, so after this call only
            # the tower-independent entries remain in `results`.
            self.results_all_towers[policy_id].append(
                tree.map_structure_with_path(
                    lambda p, *s: _all_tower_reduce(p, *s),
                    *(
                        results.pop("tower_{}".format(tower_num))
                        for tower_num in range(self.num_devices)
                    )
                )
            )
            # Copy over the remaining (tower-independent) entries as-is,
            # merging learner-stats sub-keys into the already reduced dict.
            for k, v in results.items():
                if k == LEARNER_STATS_KEY:
                    for k1, v1 in results[k].items():
                        self.results_all_towers[policy_id][-1][LEARNER_STATS_KEY][
                            k1
                        ] = v1
                else:
                    self.results_all_towers[policy_id][-1][k] = v

    def add_learn_on_batch_results_multi_agent(
        self,
        all_policies_results: Dict,
    ) -> None:
        """Adds multiple policy.learn_on_(loaded)?_batch() results to this builder.

        Args:
            all_policies_results: The results returned by all Policy.learn_on_batch or
                Policy.learn_on_loaded_batch wrapped as a dict mapping policy ID to
                results.
        """
        for pid, result in all_policies_results.items():
            # Skip the special "batch_count" entry (not a policy ID).
            if pid != "batch_count":
                self.add_learn_on_batch_results(result, policy_id=pid)

    def finalize(self):
        """Reduces all collected results and returns the final learner-info dict.

        Returns:
            A dict mapping each policy ID to its reduced (via
            `_all_tower_reduce`) stats structure. After this call, no more
            results may be added to this builder.
        """
        self.is_finalized = True

        info = {}
        for policy_id, results_all_towers in self.results_all_towers.items():
            # Reduce mean across all minibatch SGD steps (axis=0 to keep
            # all shapes as-is).
            info[policy_id] = tree.map_structure_with_path(
                _all_tower_reduce, *results_all_towers
            )

        return info
91
+
92
+
93
+ @OldAPIStack
94
+ def _all_tower_reduce(path, *tower_data):
95
+ """Reduces stats across towers based on their stats-dict paths."""
96
+ # TD-errors: Need to stay per batch item in order to be able to update
97
+ # each item's weight in a prioritized replay buffer.
98
+ if len(path) == 1 and path[0] == "td_error":
99
+ return np.concatenate(tower_data, axis=0)
100
+ elif tower_data[0] is None:
101
+ return None
102
+
103
+ if isinstance(path[-1], str):
104
+ # TODO(sven): We need to fix this terrible dependency on `str.starts_with`
105
+ # for determining, how to aggregate these stats! As "num_..." might
106
+ # be a good indicator for summing, it will fail if the stats is e.g.
107
+ # `num_samples_per_sec" :)
108
+ # Counter stats: Reduce sum.
109
+ # if path[-1].startswith("num_"):
110
+ # return np.nansum(tower_data)
111
+ # Min stats: Reduce min.
112
+ if path[-1].startswith("min_"):
113
+ return np.nanmin(tower_data)
114
+ # Max stats: Reduce max.
115
+ elif path[-1].startswith("max_"):
116
+ return np.nanmax(tower_data)
117
+ if np.isnan(tower_data).all():
118
+ return np.nan
119
+ # Everything else: Reduce mean.
120
+ return np.nanmean(tower_data)
.venv/lib/python3.11/site-packages/ray/rllib/utils/metrics/metrics_logger.py ADDED
@@ -0,0 +1,1186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import logging
3
+ from typing import Any, Dict, List, Optional, Tuple, Union
4
+
5
+ import tree # pip install dm_tree
6
+
7
+ from ray.rllib.utils import force_tuple
8
+ from ray.rllib.utils.metrics.stats import Stats
9
+ from ray.rllib.utils.framework import try_import_tf, try_import_torch
10
+ from ray.util.annotations import PublicAPI
11
+
12
+ _, tf, _ = try_import_tf()
13
+ torch, _ = try_import_torch()
14
+ logger = logging.getLogger("ray.rllib")
15
+
16
+
17
+ @PublicAPI(stability="alpha")
18
+ class MetricsLogger:
19
+ """A generic class collecting and processing metrics in RL training and evaluation.
20
+
21
+ This class represents the main API used by all of RLlib's components (internal and
22
+ user facing) in order to log, collect, and process (reduce) stats during training
23
+ and evaluation/inference.
24
+
25
+ It supports:
26
+ - Logging of simple float/int values (for example a loss) over time or from
27
+ parallel runs (n Learner workers, each one reporting a loss from their respective
28
+ data shard).
29
+ - Logging of images, videos, or other more complex data structures over time.
30
+ - Reducing these collected values using a user specified reduction method (for
31
+ example "min" or "mean") and other settings controlling the reduction and internal
32
+ data, such as sliding windows or EMA coefficients.
33
+ - Optionally clearing all logged values after a `reduce()` call to make space for
34
+ new data.
35
+
36
+ .. testcode::
37
+
38
+ import time
39
+ from ray.rllib.utils.metrics.metrics_logger import MetricsLogger
40
+ from ray.rllib.utils.test_utils import check
41
+
42
+ logger = MetricsLogger()
43
+
44
+ # 1) Logging float values (mean over window):
45
+ # Log some loss under the "loss" key. By default, all logged values
46
+ # under that key are averaged and reported back, once `reduce()` is called.
47
+ logger.log_value("loss", 0.001, reduce="mean", window=10)
48
+ logger.log_value("loss", 0.002) # <- no need to repeat arg/options on same key
49
+ # Peek at the current (reduced) value of "loss":
50
+ check(logger.peek("loss"), 0.0015) # <- expect average value
51
+ # Actually reduce the underlying Stats object(s).
52
+ results = logger.reduce()
53
+ check(results["loss"], 0.0015)
54
+
55
+ # 2) Logging float values (minimum over window):
56
+ # Log the minimum of loss values under the "min_loss" key.
57
+ logger.log_value("min_loss", 0.1, reduce="min", window=2)
58
+ logger.log_value("min_loss", 0.01)
59
+ logger.log_value("min_loss", 0.1)
60
+ logger.log_value("min_loss", 0.02)
61
+ # Peek at the current (reduced) value of "min_loss":
62
+ check(logger.peek("min_loss"), 0.02) # <- expect min value (over window=2)
63
+ # Actually reduce the underlying Stats object(s).
64
+ results = logger.reduce()
65
+ check(results["min_loss"], 0.02)
66
+
67
+ # 3) Log n counts in different (remote?) components and merge them on the
68
+ # controller side.
69
+ remote_logger_1 = MetricsLogger()
70
+ remote_logger_2 = MetricsLogger()
71
+ main_logger = MetricsLogger()
72
+ remote_logger_1.log_value("count", 2, reduce="sum", clear_on_reduce=True)
73
+ remote_logger_2.log_value("count", 3, reduce="sum", clear_on_reduce=True)
74
+ # Reduce the two remote loggers ..
75
+ remote_results_1 = remote_logger_1.reduce()
76
+ remote_results_2 = remote_logger_2.reduce()
77
+ # .. then merge the two results into the controller logger.
78
+ main_logger.merge_and_log_n_dicts([remote_results_1, remote_results_2])
79
+ check(main_logger.peek("count"), 5)
80
+
81
+ # 4) Time blocks of code using EMA (coeff=0.1). Note that the higher the coeff
82
+ # (the closer to 1.0), the more short term the EMA turns out.
83
+ logger = MetricsLogger()
84
+
85
+ # First delta measurement:
86
+ with logger.log_time("my_block_to_be_timed", reduce="mean", ema_coeff=0.1):
87
+ time.sleep(1.0)
88
+ # EMA should be ~1sec.
89
+ assert 1.1 > logger.peek("my_block_to_be_timed") > 0.9
90
+ # Second delta measurement (note that we don't have to repeat the args again, as
91
+ # the stats under that name have already been created above with the correct
92
+ # args).
93
+ with logger.log_time("my_block_to_be_timed"):
94
+ time.sleep(2.0)
95
+ # EMA should be ~1.1sec.
96
+ assert 1.15 > logger.peek("my_block_to_be_timed") > 1.05
97
+
98
+ # When calling `reduce()`, the internal values list gets cleaned up (reduced)
99
+ # and reduction results are returned.
100
+ results = logger.reduce()
101
+ # EMA should be ~1.1sec.
102
+ assert 1.15 > results["my_block_to_be_timed"] > 1.05
103
+
104
+
105
+ """
106
+
107
+ def __init__(self):
108
+ """Initializes a MetricsLogger instance."""
109
+ self.stats = {}
110
+ self._tensor_mode = False
111
+ self._tensor_keys = set()
112
+ # TODO (sven): We use a dummy RLock here for most RLlib algos, however, APPO
113
+ # and IMPALA require this to be an actual RLock (b/c of thread safety reasons).
114
+ # An actual RLock, however, breaks our current OfflineData and
115
+ # OfflinePreLearner logic, in which the Learner (which contains a
116
+ # MetricsLogger) is serialized and deserialized. We will have to fix this
117
+ # offline RL logic first, then can remove this hack here and return to always
118
+ # using the RLock.
119
+ self._threading_lock = _DummyRLock()
120
+
121
+ def __contains__(self, key: Union[str, Tuple[str, ...]]) -> bool:
122
+ """Returns True, if `key` can be found in self.stats.
123
+
124
+ Args:
125
+ key: The key to find in self.stats. This must be either a str (single,
126
+ top-level key) or a tuple of str (nested key).
127
+
128
+ Returns:
129
+ Whether `key` could be found in self.stats.
130
+ """
131
+ return self._key_in_stats(key)
132
+
133
    def peek(
        self,
        key: Union[str, Tuple[str, ...]],
        *,
        default: Optional[Any] = None,
        throughput: bool = False,
    ) -> Any:
        """Returns the (reduced) value(s) found under the given key or key sequence.

        If `key` only reaches to a nested dict deeper in `self`, that
        sub-dictionary's entire values are returned as a (nested) dict with its leafs
        being the reduced peek values.

        Note that calling this method does NOT cause an actual underlying value list
        reduction, even though reduced values are being returned. It'll keep all
        internal structures as-is.

        .. testcode::
            from ray.rllib.utils.metrics.metrics_logger import MetricsLogger
            from ray.rllib.utils.test_utils import check

            logger = MetricsLogger()
            ema = 0.01

            # Log some (EMA reduced) values.
            key = ("some", "nested", "key", "sequence")
            logger.log_value(key, 2.0, ema_coeff=ema)
            logger.log_value(key, 3.0)

            # Expected reduced value:
            expected_reduced = (1.0 - ema) * 2.0 + ema * 3.0

            # Peek at the (reduced) value under `key`.
            check(logger.peek(key), expected_reduced)

            # Peek at the (reduced) nested struct under ("some", "nested").
            check(
                logger.peek(("some", "nested")),
                {"key": {"sequence": expected_reduced}},
            )

            # Log some more, check again.
            logger.log_value(key, 4.0)
            expected_reduced = (1.0 - ema) * expected_reduced + ema * 4.0
            check(logger.peek(key), expected_reduced)

        Args:
            key: The key/key sequence of the sub-structure of `self`, whose (reduced)
                values to return.
            default: An optional default value in case `key` cannot be found in `self`.
                If default is not provided and `key` cannot be found, throws a KeyError.
            throughput: Whether to return the current throughput estimate instead of the
                actual (reduced) value.

        Returns:
            The (reduced) values of the (possibly nested) sub-structure found under
            the given `key` or key sequence.

        Raises:
            KeyError: If `key` cannot be found AND `default` is not provided.
        """
        # Use default value, b/c `key` cannot be found in our stats.
        # NOTE(review): an explicit `default=None` is indistinguishable from "no
        # default given" here, so a missing key with `default=None` still raises
        # KeyError (from `_get_key` below) -- confirm this is intended.
        if not self._key_in_stats(key) and default is not None:
            return default

        # Otherwise, return the reduced Stats' (peek) value.
        # `struct` is either a single leaf `Stats` object or a (nested) dict of them.
        struct = self._get_key(key)

        # Create a reduced view of the requested sub-structure or leaf (Stats object).
        with self._threading_lock:
            if isinstance(struct, Stats):
                return struct.peek(throughput=throughput)

            # Peek each leaf of the (shallow-copied) sub-dict; the copy keeps the
            # returned structure detached from concurrent top-level mutations.
            ret = tree.map_structure(
                lambda s: s.peek(throughput=throughput),
                struct.copy(),
            )
            return ret
211
+
212
+ @staticmethod
213
+ def peek_results(results: Any) -> Any:
214
+ """Performs `peek()` on any leaf element of an arbitrarily nested Stats struct.
215
+
216
+ Args:
217
+ results: The nested structure of Stats-leafs to be peek'd and returned.
218
+
219
+ Returns:
220
+ A corresponding structure of the peek'd `results` (reduced float/int values;
221
+ no Stats objects).
222
+ """
223
+ return tree.map_structure(
224
+ lambda s: s.peek() if isinstance(s, Stats) else s, results
225
+ )
226
+
227
    def log_value(
        self,
        key: Union[str, Tuple[str, ...]],
        value: Any,
        *,
        reduce: Optional[str] = "mean",
        window: Optional[Union[int, float]] = None,
        ema_coeff: Optional[float] = None,
        clear_on_reduce: bool = False,
        with_throughput: bool = False,
    ) -> None:
        """Logs a new value under a (possibly nested) key to the logger.

        .. testcode::

            from ray.rllib.utils.metrics.metrics_logger import MetricsLogger
            from ray.rllib.utils.test_utils import check

            logger = MetricsLogger()

            # Log n simple float values under the "loss" key. By default, all logged
            # values under that key are averaged, once `reduce()` is called.
            logger.log_value("loss", 0.01, window=10)
            logger.log_value("loss", 0.02) # don't have to repeat `window` if key
            # already exists
            logger.log_value("loss", 0.03)

            # Peek at the current (reduced) value.
            # Note that in the underlying structure, the internal values list still
            # contains all logged values (0.01, 0.02, and 0.03).
            check(logger.peek("loss"), 0.02)

            # Log 10x (window size) the same value.
            for _ in range(10):
                logger.log_value("loss", 0.05)
            check(logger.peek("loss"), 0.05)

            # Internals check (note that users should not be concerned with accessing
            # these).
            check(len(logger.stats["loss"].values), 13)

            # Only, when we call `reduce` does the underlying structure get "cleaned
            # up". In this case, the list is shortened to 10 items (window size).
            results = logger.reduce(return_stats_obj=False)
            check(results, {"loss": 0.05})
            check(len(logger.stats["loss"].values), 10)

            # Log a value under a deeper nested key.
            logger.log_value(("some", "nested", "key"), -1.0)
            check(logger.peek(("some", "nested", "key")), -1.0)

            # Log n values without reducing them (we want to just collect some items).
            logger.log_value("some_items", 5.0, reduce=None)
            logger.log_value("some_items", 6.0)
            logger.log_value("some_items", 7.0)
            # Peeking at these returns the full list of items (no reduction set up).
            check(logger.peek("some_items"), [5.0, 6.0, 7.0])
            # If you don't want the internal list to grow indefinitely, you should set
            # `clear_on_reduce=True`:
            logger.log_value("some_more_items", -5.0, reduce=None, clear_on_reduce=True)
            logger.log_value("some_more_items", -6.0)
            logger.log_value("some_more_items", -7.0)
            # Peeking at these returns the full list of items (no reduction set up).
            check(logger.peek("some_more_items"), [-5.0, -6.0, -7.0])
            # Reducing everything (and return plain values, not `Stats` objects).
            results = logger.reduce(return_stats_obj=False)
            check(results, {
                "loss": 0.05,
                "some": {
                    "nested": {
                        "key": -1.0,
                    },
                },
                "some_items": [5.0, 6.0, 7.0], # reduce=None; list as-is
                "some_more_items": [-5.0, -6.0, -7.0], # reduce=None; list as-is
            })
            # However, the `reduce()` call did empty the `some_more_items` list
            # (b/c we set `clear_on_reduce=True`).
            check(logger.peek("some_more_items"), [])
            # The "some_items" list was cleared as well: because it was logged with
            # `reduce=None` and no `window`, `clear_on_reduce` was force-set to True
            # internally (see first lines of this method).
            check(logger.peek("some_items"), [])

        Args:
            key: The key (or nested key-tuple) to log the `value` under.
            value: The value to log.
            reduce: The reduction method to apply, once `self.reduce()` is called.
                If None, will collect all logged values under `key` in a list (and
                also return that list upon calling `self.reduce()`).
            window: An optional window size to reduce over.
                If not None, then the reduction operation is only applied to the most
                recent `window` items, and - after reduction - the internal values list
                under `key` is shortened to hold at most `window` items (the most
                recent ones).
                Must be None if `ema_coeff` is provided.
                If None (and `ema_coeff` is None), reduction must not be "mean".
            ema_coeff: An optional EMA coefficient to use if `reduce` is "mean"
                and no `window` is provided. Note that if both `window` and `ema_coeff`
                are provided, an error is thrown. Also, if `ema_coeff` is provided,
                `reduce` must be "mean".
                The reduction formula for EMA is:
                EMA(t1) = (1.0 - ema_coeff) * EMA(t0) + ema_coeff * new_value
            clear_on_reduce: If True, all values under `key` will be emptied after
                `self.reduce()` is called. Setting this to True is useful for cases,
                in which the internal values list would otherwise grow indefinitely,
                for example if reduce is None and there is no `window` provided.
            with_throughput: Whether to track a throughput estimate together with this
                metric. This is only supported for `reduce=sum` and
                `clear_on_reduce=False` metrics (aka. "lifetime counts"). The `Stats`
                object under the logged key then keeps track of the time passed
                between two consecutive calls to `reduce()` and update its throughput
                estimate. The current throughput estimate of a key can be obtained
                through: peeked_value, throughput_per_sec =
                <MetricsLogger>.peek([key], throughput=True).
        """
        # No reduction (continue appending to list) AND no window.
        # -> We'll force-reset our values upon `reduce()`.
        if reduce is None and (window is None or window == float("inf")):
            clear_on_reduce = True

        # In tensor-mode, validates the incoming value and tracks its key.
        self._check_tensor(key, value)

        with self._threading_lock:
            # `key` doesn't exist -> Automatically create it.
            if not self._key_in_stats(key):
                self._set_key(
                    key,
                    (
                        # Incoming `Stats` object: clone its settings and adopt its
                        # values (the reduce/window/... args are ignored in this case).
                        Stats.similar_to(value, init_value=value.values)
                        if isinstance(value, Stats)
                        else Stats(
                            value,
                            reduce=reduce,
                            window=window,
                            ema_coeff=ema_coeff,
                            clear_on_reduce=clear_on_reduce,
                            throughput=with_throughput,
                        )
                    ),
                )
            # If value itself is a `Stats`, we merge it on time axis into self's
            # `Stats`.
            elif isinstance(value, Stats):
                self._get_key(key).merge_on_time_axis(value)
            # Otherwise, we just push the value into self's `Stats`.
            else:
                self._get_key(key).push(value)
373
+
374
    def log_dict(
        self,
        stats_dict,
        *,
        key: Optional[Union[str, Tuple[str, ...]]] = None,
        reduce: Optional[str] = "mean",
        window: Optional[Union[int, float]] = None,
        ema_coeff: Optional[float] = None,
        clear_on_reduce: bool = False,
    ) -> None:
        """Logs all leafs (`Stats` or simple values) of a (nested) dict to this logger.

        Traverses through all leafs of `stats_dict` and - if a path cannot be found in
        this logger yet, will add the `Stats` found at the leaf under that new key.
        If a path already exists, will merge the found leaf (`Stats`) with the ones
        already logged before. This way, `stats_dict` does NOT have to have
        the same structure as what has already been logged to `self`, but can be used to
        log values under new keys or nested key paths.

        .. testcode::
            from ray.rllib.utils.metrics.metrics_logger import MetricsLogger
            from ray.rllib.utils.test_utils import check

            logger = MetricsLogger()

            # Log n dicts with keys "a" and (some) "b". By default, all logged values
            # under that key are averaged, once `reduce()` is called.
            logger.log_dict(
                {
                    "a": 0.1,
                    "b": -0.1,
                },
                window=10,
            )
            logger.log_dict({
                "b": -0.2,
            })  # don't have to repeat `window` arg if key already exists
            logger.log_dict({
                "a": 0.2,
                "c": {"d": 5.0},  # can also introduce an entirely new (nested) key
            })

            # Peek at the current (reduced) values under "a" and "b".
            check(logger.peek("a"), 0.15)
            check(logger.peek("b"), -0.15)
            check(logger.peek(("c", "d")), 5.0)

            # Reduced all stats.
            results = logger.reduce(return_stats_obj=False)
            check(results, {
                "a": 0.15,
                "b": -0.15,
                "c": {"d": 5.0},
            })

        Args:
            stats_dict: The (possibly nested) dict with `Stats` or individual values as
                leafs to be logged to this logger.
            key: An additional key (or tuple of keys) to prepend to all the keys
                (or tuples of keys in case of nesting) found inside `stats_dict`.
                Useful to log the entire contents of `stats_dict` in a more organized
                fashion under one new key, for example logging the results returned by
                an EnvRunner under a key such as "env_runners".
            reduce: The reduction method to apply, once `self.reduce()` is called.
                If None, will collect all logged values under `key` in a list (and
                also return that list upon calling `self.reduce()`).
            window: An optional window size to reduce over.
                If not None, then the reduction operation is only applied to the most
                recent `window` items, and - after reduction - the internal values list
                under `key` is shortened to hold at most `window` items (the most
                recent ones).
                Must be None if `ema_coeff` is provided.
                If None (and `ema_coeff` is None), reduction must not be "mean".
            ema_coeff: An optional EMA coefficient to use if `reduce` is "mean"
                and no `window` is provided. Note that if both `window` and `ema_coeff`
                are provided, an error is thrown. Also, if `ema_coeff` is provided,
                `reduce` must be "mean".
                The reduction formula for EMA is:
                EMA(t1) = (1.0 - ema_coeff) * EMA(t0) + ema_coeff * new_value
            clear_on_reduce: If True, all values under `key` will be emptied after
                `self.reduce()` is called. Setting this to True is useful for cases,
                in which the internal values list would otherwise grow indefinitely,
                for example if reduce is None and there is no `window` provided.
        """
        assert isinstance(
            stats_dict, dict
        ), f"`stats_dict` ({stats_dict}) must be dict!"

        # Prefix key (may be empty tuple if `key` is None).
        prefix_key = force_tuple(key)

        def _map(path, stat_or_value):
            # Full key = optional prefix + the leaf's path within `stats_dict`.
            extended_key = prefix_key + force_tuple(tree.flatten(path))

            self.log_value(
                extended_key,
                stat_or_value,
                reduce=reduce,
                window=window,
                ema_coeff=ema_coeff,
                clear_on_reduce=clear_on_reduce,
            )

        # NOTE(review): `log_value` (called via `_map`) re-acquires
        # `self._threading_lock` while we already hold it here, so this relies on
        # the lock being reentrant (RLock) -- confirm.
        with self._threading_lock:
            tree.map_structure_with_path(_map, stats_dict)
478
+
479
    def merge_and_log_n_dicts(
        self,
        stats_dicts: List[Dict[str, Any]],
        *,
        key: Optional[Union[str, Tuple[str, ...]]] = None,
        # TODO (sven): Maybe remove these args. They don't seem to make sense in this
        #  method. If we do so, values in the dicts must be Stats instances, though.
        reduce: Optional[str] = "mean",
        window: Optional[Union[int, float]] = None,
        ema_coeff: Optional[float] = None,
        clear_on_reduce: bool = False,
    ) -> None:
        """Merges n dicts, generated by n parallel components, and logs the results.

        .. testcode::

            from ray.rllib.utils.metrics.metrics_logger import MetricsLogger
            from ray.rllib.utils.test_utils import check

            # Example: n Learners logging loss stats to be merged.
            # Note that losses should usually be logged with a window=1 so they don't
            # get smeared over time and instead provide an accurate picture of the
            # current situation.
            main_logger = MetricsLogger()

            logger_learner1 = MetricsLogger()
            logger_learner1.log_value("loss", 0.1, window=1)
            learner1_results = logger_learner1.reduce()

            logger_learner2 = MetricsLogger()
            logger_learner2.log_value("loss", 0.2, window=1)
            learner2_results = logger_learner2.reduce()

            # Merge the stats from both Learners.
            main_logger.merge_and_log_n_dicts(
                [learner1_results, learner2_results],
                key="learners",
            )
            check(main_logger.peek(("learners", "loss")), 0.15)

            # Example: m EnvRunners logging episode returns to be merged.
            main_logger = MetricsLogger()

            logger_env_runner1 = MetricsLogger()
            logger_env_runner1.log_value("mean_ret", 100.0, window=3)
            logger_env_runner1.log_value("mean_ret", 200.0)
            logger_env_runner1.log_value("mean_ret", 300.0)
            logger_env_runner1.log_value("mean_ret", 400.0)
            env_runner1_results = logger_env_runner1.reduce()

            logger_env_runner2 = MetricsLogger()
            logger_env_runner2.log_value("mean_ret", 150.0, window=3)
            logger_env_runner2.log_value("mean_ret", 250.0)
            logger_env_runner2.log_value("mean_ret", 350.0)
            logger_env_runner2.log_value("mean_ret", 450.0)
            env_runner2_results = logger_env_runner2.reduce()

            # Merge the stats from both EnvRunners.
            main_logger.merge_and_log_n_dicts(
                [env_runner1_results, env_runner2_results],
                key="env_runners",
            )
            # The expected procedure is as follows:
            # The individual internal values lists of the two loggers are as follows:
            # env runner 1: [100, 200, 300, 400]
            # env runner 2: [150, 250, 350, 450]
            # Move backwards from index=-1 (each time, loop through both env runners)
            # index=-1 -> [400, 450] -> reduce-mean -> [425] -> repeat 2 times (number
            # of env runners) -> [425, 425]
            # index=-2 -> [300, 350] -> reduce-mean -> [325] -> repeat 2 times
            # -> append -> [425, 425, 325, 325] -> STOP b/c we have reached >= window.
            # reverse the list -> [325, 325, 425, 425]
            check(
                main_logger.stats["env_runners"]["mean_ret"].values,
                [325, 325, 425, 425],
            )
            check(main_logger.peek(("env_runners", "mean_ret")), (325 + 425 + 425) / 3)

            # Example: Lifetime sum over n parallel components' stats.
            main_logger = MetricsLogger()

            logger1 = MetricsLogger()
            logger1.log_value("some_stat", 50, reduce="sum", window=None)
            logger1.log_value("some_stat", 25, reduce="sum", window=None)
            logger1_results = logger1.reduce()

            logger2 = MetricsLogger()
            logger2.log_value("some_stat", 75, reduce="sum", window=None)
            logger2_results = logger2.reduce()

            # Merge the stats from both Learners.
            main_logger.merge_and_log_n_dicts([logger1_results, logger2_results])
            check(main_logger.peek("some_stat"), 150)

            # Example: Sum over n parallel components' stats with a window of 3.
            main_logger = MetricsLogger()

            logger1 = MetricsLogger()
            logger1.log_value("some_stat", 50, reduce="sum", window=3)
            logger1.log_value("some_stat", 25, reduce="sum")
            logger1.log_value("some_stat", 10, reduce="sum")
            logger1.log_value("some_stat", 5, reduce="sum")
            logger1_results = logger1.reduce()

            logger2 = MetricsLogger()
            logger2.log_value("some_stat", 75, reduce="sum", window=3)
            logger2.log_value("some_stat", 100, reduce="sum")
            logger2_results = logger2.reduce()

            # Merge the stats from both Learners.
            main_logger.merge_and_log_n_dicts([logger1_results, logger2_results])
            # The expected procedure is as follows:
            # The individual internal values lists of the two loggers are as follows:
            # env runner 1: [50, 25, 10, 5]
            # env runner 2: [75, 100]
            # Move backwards from index=-1 (each time, loop through both loggers)
            # index=-1 -> [5, 100] -> leave as-is, b/c we are sum'ing -> [5, 100]
            # index=-2 -> [10, 75] -> leave as-is -> [5, 100, 10, 75] -> STOP b/c we
            # have reached >= window.
            # reverse the list -> [75, 10, 100, 5]
            check(main_logger.peek("some_stat"), 115)  # last 3 items (window) get sum'd

        Args:
            stats_dicts: List of n stats dicts to be merged and then logged.
            key: Optional top-level key under which to log all keys/key sequences
                found in the n `stats_dicts`.
            reduce: The reduction method to apply, once `self.reduce()` is called.
                If None, will collect all logged values under `key` in a list (and
                also return that list upon calling `self.reduce()`).
            window: An optional window size to reduce over.
                If not None, then the reduction operation is only applied to the most
                recent `window` items, and - after reduction - the internal values list
                under `key` is shortened to hold at most `window` items (the most
                recent ones).
                Must be None if `ema_coeff` is provided.
                If None (and `ema_coeff` is None), reduction must not be "mean".
            ema_coeff: An optional EMA coefficient to use if `reduce` is "mean"
                and no `window` is provided. Note that if both `window` and `ema_coeff`
                are provided, an error is thrown. Also, if `ema_coeff` is provided,
                `reduce` must be "mean".
                The reduction formula for EMA is:
                EMA(t1) = (1.0 - ema_coeff) * EMA(t0) + ema_coeff * new_value
            clear_on_reduce: If True, all values under `key` will be emptied after
                `self.reduce()` is called. Setting this to True is useful for cases,
                in which the internal values list would otherwise grow indefinitely,
                for example if reduce is None and there is no `window` provided.
        """
        prefix_key = force_tuple(key)

        # Collect the union of all (nested) key paths across all incoming dicts.
        all_keys = set()
        for stats_dict in stats_dicts:
            tree.map_structure_with_path(
                lambda path, _: all_keys.add(force_tuple(path)),
                stats_dict,
            )

        # No reduction (continue appending to list) AND no window.
        # -> We'll force-reset our values upon `reduce()`.
        if reduce is None and (window is None or window == float("inf")):
            clear_on_reduce = True

        # NOTE(review): `key` (the method parameter) is re-bound by this loop
        # variable; the original prefix is preserved in `prefix_key` above.
        for key in all_keys:
            extended_key = prefix_key + key
            # All leafs (Stats or plain values) found under this path, across the
            # incoming dicts (a dict missing this path is simply skipped).
            available_stats = [
                self._get_key(key, stats=s)
                for s in stats_dicts
                if self._key_in_stats(key, stats=s)
            ]
            base_stats = None
            more_stats = []
            for i, stat_or_value in enumerate(available_stats):
                # Value is NOT a Stats object -> Convert it to one.
                if not isinstance(stat_or_value, Stats):
                    self._check_tensor(extended_key, stat_or_value)
                    available_stats[i] = stat_or_value = Stats(
                        stat_or_value,
                        reduce=reduce,
                        window=window,
                        ema_coeff=ema_coeff,
                        clear_on_reduce=clear_on_reduce,
                    )

                # Create a new Stats object to merge everything into as parallel,
                # equally weighted Stats.
                if base_stats is None:
                    base_stats = Stats.similar_to(
                        stat_or_value,
                        init_value=stat_or_value.values,
                    )
                else:
                    more_stats.append(stat_or_value)

            # Special case: `base_stats` is a lifetime sum (reduce=sum,
            # clear_on_reduce=False) -> We subtract the previous value (from 2
            # `reduce()` calls ago) from all to-be-merged stats, so we don't count
            # twice the older sum from before.
            if (
                base_stats._reduce_method == "sum"
                and base_stats._window is None
                and base_stats._clear_on_reduce is False
            ):
                for stat in [base_stats] + more_stats:
                    stat.push(-stat.peek(previous=2))

            # There are more than one incoming parallel others -> Merge all of them
            # first in parallel.
            if len(more_stats) > 0:
                base_stats.merge_in_parallel(*more_stats)

            # `key` not in self yet -> Store merged stats under the new key.
            if not self._key_in_stats(extended_key):
                self._set_key(extended_key, base_stats)
            # `key` already exists in `self` -> Merge `base_stats` into self's entry
            # on time axis, meaning give the incoming values priority over already
            # existing ones.
            else:
                self._get_key(extended_key).merge_on_time_axis(base_stats)
696
+
697
+ def log_time(
698
+ self,
699
+ key: Union[str, Tuple[str, ...]],
700
+ *,
701
+ reduce: Optional[str] = "mean",
702
+ window: Optional[Union[int, float]] = None,
703
+ ema_coeff: Optional[float] = None,
704
+ clear_on_reduce: bool = False,
705
+ ) -> Stats:
706
+ """Measures and logs a time delta value under `key` when used with a with-block.
707
+
708
+ .. testcode::
709
+
710
+ import time
711
+ from ray.rllib.utils.metrics.metrics_logger import MetricsLogger
712
+ from ray.rllib.utils.test_utils import check
713
+
714
+ logger = MetricsLogger()
715
+
716
+ # First delta measurement:
717
+ with logger.log_time("my_block_to_be_timed", reduce="mean", ema_coeff=0.1):
718
+ time.sleep(1.0)
719
+
720
+ # EMA should be ~1sec.
721
+ assert 1.1 > logger.peek("my_block_to_be_timed") > 0.9
722
+
723
+ # Second delta measurement (note that we don't have to repeat the args
724
+ # again, as the stats under that name have already been created above with
725
+ # the correct args).
726
+ with logger.log_time("my_block_to_be_timed"):
727
+ time.sleep(2.0)
728
+
729
+ # EMA should be ~1.1sec.
730
+ assert 1.15 > logger.peek("my_block_to_be_timed") > 1.05
731
+
732
+ # When calling `reduce()`, the internal values list gets cleaned up.
733
+ check(len(logger.stats["my_block_to_be_timed"].values), 2) # still 2 deltas
734
+ results = logger.reduce()
735
+ check(len(logger.stats["my_block_to_be_timed"].values), 1) # reduced to 1
736
+ # EMA should be ~1.1sec.
737
+ assert 1.15 > results["my_block_to_be_timed"] > 1.05
738
+
739
+ Args:
740
+ key: The key (or tuple of keys) to log the measured time delta under.
741
+ reduce: The reduction method to apply, once `self.reduce()` is called.
742
+ If None, will collect all logged values under `key` in a list (and
743
+ also return that list upon calling `self.reduce()`).
744
+ window: An optional window size to reduce over.
745
+ If not None, then the reduction operation is only applied to the most
746
+ recent `window` items, and - after reduction - the internal values list
747
+ under `key` is shortened to hold at most `window` items (the most
748
+ recent ones).
749
+ Must be None if `ema_coeff` is provided.
750
+ If None (and `ema_coeff` is None), reduction must not be "mean".
751
+ ema_coeff: An optional EMA coefficient to use if `reduce` is "mean"
752
+ and no `window` is provided. Note that if both `window` and `ema_coeff`
753
+ are provided, an error is thrown. Also, if `ema_coeff` is provided,
754
+ `reduce` must be "mean".
755
+ The reduction formula for EMA is:
756
+ EMA(t1) = (1.0 - ema_coeff) * EMA(t0) + ema_coeff * new_value
757
+ clear_on_reduce: If True, all values under `key` will be emptied after
758
+ `self.reduce()` is called. Setting this to True is useful for cases,
759
+ in which the internal values list would otherwise grow indefinitely,
760
+ for example if reduce is None and there is no `window` provided.
761
+ """
762
+ # No reduction (continue appending to list) AND no window.
763
+ # -> We'll force-reset our values upon `reduce()`.
764
+ if reduce is None and (window is None or window == float("inf")):
765
+ clear_on_reduce = True
766
+
767
+ if not self._key_in_stats(key):
768
+ self._set_key(
769
+ key,
770
+ Stats(
771
+ reduce=reduce,
772
+ window=window,
773
+ ema_coeff=ema_coeff,
774
+ clear_on_reduce=clear_on_reduce,
775
+ ),
776
+ )
777
+
778
+ # Return the Stats object, so a `with` clause can enter and exit it.
779
+ return self._get_key(key)
780
+
781
+ def reduce(
782
+ self,
783
+ key: Optional[Union[str, Tuple[str, ...]]] = None,
784
+ *,
785
+ return_stats_obj: bool = True,
786
+ ) -> Dict:
787
+ """Reduces all logged values based on their settings and returns a result dict.
788
+
789
+ DO NOT CALL THIS METHOD under normal circumstances! RLlib's components call it
790
+ right before a distinct step has been completed and the (MetricsLogger-based)
791
+ results of that step need to be passed upstream to other components for further
792
+ processing.
793
+
794
+ The returned result dict has the exact same structure as the logged keys (or
795
+ nested key sequences) combined. At the leafs of the returned structure are
796
+ either `Stats` objects (`return_stats_obj=True`, which is the default) or
797
+ primitive (non-Stats) values (`return_stats_obj=False`). In case of
798
+ `return_stats_obj=True`, the returned dict with `Stats` at the leafs can
799
+ conveniently be re-used upstream for further logging and reduction operations.
800
+
801
+ For example, imagine component A (e.g. an Algorithm) containing a MetricsLogger
802
+ and n remote components (e.g. n EnvRunners), each with their own
803
+ MetricsLogger object. Component A calls its n remote components, each of
804
+ which returns an equivalent, reduced dict with `Stats` as leafs.
805
+ Component A can then further log these n result dicts through its own
806
+ MetricsLogger through:
807
+ `logger.merge_and_log_n_dicts([n returned result dicts from n subcomponents])`.
808
+
809
+ The returned result dict has the exact same structure as the logged keys (or
810
+ nested key sequences) combined. At the leafs of the returned structure are
811
+ either `Stats` objects (`return_stats_obj=True`, which is the default) or
812
+ primitive (non-Stats) values (`return_stats_obj=False`). In case of
813
+ `return_stats_obj=True`, the returned dict with Stats at the leafs can be
814
+ reused conveniently downstream for further logging and reduction operations.
815
+
816
+ For example, imagine component A (e.g. an Algorithm) containing a MetricsLogger
817
+ and n remote components (e.g. n EnvRunner workers), each with their own
818
+ MetricsLogger object. Component A calls its n remote components, each of
819
+ which returns an equivalent, reduced dict with `Stats` instances as leafs.
820
+ Component A can now further log these n result dicts through its own
821
+ MetricsLogger:
822
+ `logger.merge_and_log_n_dicts([n returned result dicts from the remote
823
+ components])`.
824
+
825
+ .. testcode::
826
+
827
+ from ray.rllib.utils.metrics.metrics_logger import MetricsLogger
828
+ from ray.rllib.utils.test_utils import check
829
+
830
+ # Log some (EMA reduced) values.
831
+ logger = MetricsLogger()
832
+ logger.log_value("a", 2.0)
833
+ logger.log_value("a", 3.0)
834
+ expected_reduced = (1.0 - 0.01) * 2.0 + 0.01 * 3.0
835
+ # Reduce and return primitive values (not Stats objects).
836
+ results = logger.reduce(return_stats_obj=False)
837
+ check(results, {"a": expected_reduced})
838
+
839
+ # Log some values to be averaged with a sliding window.
840
+ logger = MetricsLogger()
841
+ logger.log_value("a", 2.0, window=2)
842
+ logger.log_value("a", 3.0)
843
+ logger.log_value("a", 4.0)
844
+ expected_reduced = (3.0 + 4.0) / 2 # <- win size is only 2; first logged
845
+ # item not used
846
+ # Reduce and return primitive values (not Stats objects).
847
+ results = logger.reduce(return_stats_obj=False)
848
+ check(results, {"a": expected_reduced})
849
+
850
+ # Assume we have 2 remote components, each one returning an equivalent
851
+ # reduced dict when called. We can simply use these results and log them
852
+ # to our own MetricsLogger, then reduce over these 2 logged results.
853
+ comp1_logger = MetricsLogger()
854
+ comp1_logger.log_value("a", 1.0, window=10)
855
+ comp1_logger.log_value("a", 2.0)
856
+ result1 = comp1_logger.reduce() # <- return Stats objects as leafs
857
+
858
+ comp2_logger = MetricsLogger()
859
+ comp2_logger.log_value("a", 3.0, window=10)
860
+ comp2_logger.log_value("a", 4.0)
861
+ result2 = comp2_logger.reduce() # <- return Stats objects as leafs
862
+
863
+ # Now combine the 2 equivalent results into 1 end result dict.
864
+ downstream_logger = MetricsLogger()
865
+ downstream_logger.merge_and_log_n_dicts([result1, result2])
866
+ # What happens internally is that both values lists of the 2 components
867
+ # are merged (concat'd) and randomly shuffled, then clipped at 10 (window
868
+ # size). This is done such that no component has an "advantage" over the
869
+ # other as we don't know the exact time-order in which these parallelly
870
+ # running components logged their own "a"-values.
871
+ # We execute similarly useful merging strategies for other reduce settings,
872
+ # such as EMA, max/min/sum-reducing, etc..
873
+ end_result = downstream_logger.reduce(return_stats_obj=False)
874
+ check(end_result, {"a": 2.5})
875
+
876
+ Args:
877
+ key: Optional key or key sequence (for nested location within self.stats),
878
+ limiting the reduce operation to that particular sub-structure of self.
879
+ If None, will reduce all of self's Stats.
880
+ return_stats_obj: Whether in the returned dict, the leafs should be Stats
881
+ objects. This is the default as it enables users to continue using
882
+ (and further logging) the results of this call inside another
883
+ (downstream) MetricsLogger object.
884
+
885
+ Returns:
886
+ A (nested) dict matching the structure of `self.stats` (contains all ever
887
+ logged keys to this MetricsLogger) with the leafs being (reduced) Stats
888
+ objects if `return_stats_obj=True` or primitive values, carrying no
889
+ reduction and history information, if `return_stats_obj=False`.
890
+ """
891
+ # For better error message, catch the last key-path (reducing of which might
892
+ # throw an error).
893
+ PATH = None
894
+
895
+ def _reduce(path, stats):
896
+ nonlocal PATH
897
+ PATH = path
898
+ return stats.reduce()
899
+
900
+ # Create a shallow (yet nested) copy of `self.stats` in case we need to reset
901
+ # some of our stats due to this `reduce()` call and Stats having
902
+ # `self.clear_on_reduce=True`. In the latter case we would receive a new empty
903
+ # `Stats` object from `stat.reduce()` with the same settings as existing one and
904
+ # can now re-assign it to `self.stats[key]`, while we return from this method
905
+ # the properly reduced, but not cleared/emptied new `Stats`.
906
+ if key is not None:
907
+ stats_to_return = self._get_key(key, key_error=False)
908
+ else:
909
+ stats_to_return = self.stats
910
+
911
+ try:
912
+ with self._threading_lock:
913
+ assert (
914
+ not self.tensor_mode
915
+ ), "Can't reduce if `self.tensor_mode` is True!"
916
+ reduced = copy.deepcopy(
917
+ tree.map_structure_with_path(_reduce, stats_to_return)
918
+ )
919
+ if key is not None:
920
+ self._set_key(key, reduced)
921
+ else:
922
+ self.stats = reduced
923
+ # Provide proper error message if reduction fails due to bad data.
924
+ except Exception as e:
925
+ raise ValueError(
926
+ "There was an error while reducing the Stats object under key="
927
+ f"{PATH}! Check, whether you logged invalid or incompatible "
928
+ "values into this key over time in your custom code."
929
+ f"\nThe values under this key are: {self._get_key(PATH).values}."
930
+ f"\nThe original error was {str(e)}"
931
+ )
932
+
933
+ # Return (reduced) `Stats` objects as leafs.
934
+ if return_stats_obj:
935
+ return stats_to_return
936
+ # Return actual (reduced) values (not reduced `Stats` objects) as leafs.
937
+ else:
938
+ return self.peek_results(stats_to_return)
939
+
940
    def activate_tensor_mode(self) -> None:
        """Switches to tensor-mode, in which in-graph tensors can be logged.

        Should be used before calling in-graph/compiled functions, for example loss
        functions. The user can then still call the `log_...` APIs, but each incoming
        value will be checked for a) whether it is a tensor indeed and b) the `window`
        args must be 1 (MetricsLogger does not support any tensor-framework reducing
        operations).

        When in tensor-mode, we also track all incoming `log_...` values and return
        them TODO (sven) continue docstring

        """
        # NOTE(review): The lock is deliberately acquired here WITHOUT a matching
        # release in this method -- it is held for the entire tensor-mode phase and
        # released later in `tensors_to_numpy()`. If the assert below fires, the
        # lock remains held; confirm this is intended.
        self._threading_lock.acquire()
        # Tensor-mode must not be entered twice (no nesting supported).
        assert not self.tensor_mode
        self._tensor_mode = True
956
+
957
+ def deactivate_tensor_mode(self):
958
+ """Switches off tensor-mode."""
959
+ assert self.tensor_mode
960
+ self._tensor_mode = False
961
+ # Return all logged tensors (logged during the tensor-mode phase).
962
+ logged_tensors = {key: self._get_key(key).peek() for key in self._tensor_keys}
963
+ # Clear out logged tensor keys.
964
+ self._tensor_keys.clear()
965
+ return logged_tensors
966
+
967
    def tensors_to_numpy(self, tensor_metrics):
        """Converts all previously logged and returned tensors back to numpy values.

        Args:
            tensor_metrics: Dict mapping (flat) keys to the numpy values with which
                the corresponding `Stats` objects' tensor values should be replaced.
                Presumably the (converted) dict returned by
                `deactivate_tensor_mode()` -- TODO confirm with callers.
        """
        for key, values in tensor_metrics.items():
            # Every key handed in here must already exist in `self.stats`.
            assert self._key_in_stats(key)
            self._get_key(key).set_to_numpy_values(values)
        # Release the lock acquired earlier in `activate_tensor_mode()` (it is held
        # across the entire tensor-mode phase).
        self._threading_lock.release()
973
+
974
    @property
    def tensor_mode(self):
        """Whether tensor-mode is currently active (see `activate_tensor_mode`)."""
        return self._tensor_mode
977
+
978
+ def set_value(
979
+ self,
980
+ key: Union[str, Tuple[str, ...]],
981
+ value: Any,
982
+ *,
983
+ reduce: Optional[str] = "mean",
984
+ window: Optional[Union[int, float]] = None,
985
+ ema_coeff: Optional[float] = None,
986
+ clear_on_reduce: bool = False,
987
+ with_throughput: bool = False,
988
+ ) -> None:
989
+ """Overrides the logged values under `key` with `value`.
990
+
991
+ The internal values list under `key` is cleared and reset to [`value`]. If
992
+ `key` already exists, this method will NOT alter the reduce settings. Otherwise,
993
+ it will apply the provided reduce settings (`reduce`, `window`, `ema_coeff`,
994
+ and `clear_on_reduce`).
995
+
996
+ Args:
997
+ key: The key to override.
998
+ value: The new value to set the internal values list to (will be set to
999
+ a list containing a single item `value`).
1000
+ reduce: The reduction method to apply, once `self.reduce()` is called.
1001
+ If None, will collect all logged values under `key` in a list (and
1002
+ also return that list upon calling `self.reduce()`).
1003
+ Note that this is only applied if `key` does not exist in `self` yet.
1004
+ window: An optional window size to reduce over.
1005
+ If not None, then the reduction operation is only applied to the most
1006
+ recent `window` items, and - after reduction - the internal values list
1007
+ under `key` is shortened to hold at most `window` items (the most
1008
+ recent ones).
1009
+ Must be None if `ema_coeff` is provided.
1010
+ If None (and `ema_coeff` is None), reduction must not be "mean".
1011
+ Note that this is only applied if `key` does not exist in `self` yet.
1012
+ ema_coeff: An optional EMA coefficient to use if `reduce` is "mean"
1013
+ and no `window` is provided. Note that if both `window` and `ema_coeff`
1014
+ are provided, an error is thrown. Also, if `ema_coeff` is provided,
1015
+ `reduce` must be "mean".
1016
+ The reduction formula for EMA is:
1017
+ EMA(t1) = (1.0 - ema_coeff) * EMA(t0) + ema_coeff * new_value
1018
+ Note that this is only applied if `key` does not exist in `self` yet.
1019
+ clear_on_reduce: If True, all values under `key` will be emptied after
1020
+ `self.reduce()` is called. Setting this to True is useful for cases,
1021
+ in which the internal values list would otherwise grow indefinitely,
1022
+ for example if reduce is None and there is no `window` provided.
1023
+ Note that this is only applied if `key` does not exist in `self` yet.
1024
+ with_throughput: Whether to track a throughput estimate together with this
1025
+ metric. This is only supported for `reduce=sum` and
1026
+ `clear_on_reduce=False` metrics (aka. "lifetime counts"). The `Stats`
1027
+ object under the logged key then keeps track of the time passed
1028
+ between two consecutive calls to `reduce()` and update its throughput
1029
+ estimate. The current throughput estimate of a key can be obtained
1030
+ through: peeked_value, throuthput_per_sec =
1031
+ <MetricsLogger>.peek([key], throughput=True).
1032
+ """
1033
+ # Key already in self -> Erase internal values list with [`value`].
1034
+ if self._key_in_stats(key):
1035
+ stats = self._get_key(key)
1036
+ with self._threading_lock:
1037
+ stats.values = [value]
1038
+ # Key cannot be found in `self` -> Simply log as a (new) value.
1039
+ else:
1040
+ self.log_value(
1041
+ key,
1042
+ value,
1043
+ reduce=reduce,
1044
+ window=window,
1045
+ ema_coeff=ema_coeff,
1046
+ clear_on_reduce=clear_on_reduce,
1047
+ with_throughput=with_throughput,
1048
+ )
1049
+
1050
+ def reset(self) -> None:
1051
+ """Resets all data stored in this MetricsLogger.
1052
+
1053
+ .. testcode::
1054
+
1055
+ from ray.rllib.utils.metrics.metrics_logger import MetricsLogger
1056
+ from ray.rllib.utils.test_utils import check
1057
+
1058
+ logger = MetricsLogger()
1059
+ logger.log_value("a", 1.0)
1060
+ check(logger.peek("a"), 1.0)
1061
+ logger.reset()
1062
+ check(logger.reduce(), {})
1063
+ """
1064
+ with self._threading_lock:
1065
+ self.stats = {}
1066
+ self._tensor_keys = set()
1067
+
1068
+ def delete(self, *key: Tuple[str, ...], key_error: bool = True) -> None:
1069
+ """Deletes the given `key` from this metrics logger's stats.
1070
+
1071
+ Args:
1072
+ key: The key or key sequence (for nested location within self.stats),
1073
+ to delete from this MetricsLogger's stats.
1074
+ key_error: Whether to throw a KeyError if `key` cannot be found in `self`.
1075
+
1076
+ Raises:
1077
+ KeyError: If `key` cannot be found in `self` AND `key_error` is True.
1078
+ """
1079
+ self._del_key(key, key_error)
1080
+
1081
+ def get_state(self) -> Dict[str, Any]:
1082
+ """Returns the current state of `self` as a dict.
1083
+
1084
+ Note that the state is merely the combination of all states of the individual
1085
+ `Stats` objects stored under `self.stats`.
1086
+ """
1087
+ stats_dict = {}
1088
+
1089
+ def _map(path, stats):
1090
+ # Convert keys to strings for msgpack-friendliness.
1091
+ stats_dict["--".join(path)] = stats.get_state()
1092
+
1093
+ with self._threading_lock:
1094
+ tree.map_structure_with_path(_map, self.stats)
1095
+
1096
+ return {"stats": stats_dict}
1097
+
1098
+ def set_state(self, state: Dict[str, Any]) -> None:
1099
+ """Sets the state of `self` to the given `state`.
1100
+
1101
+ Args:
1102
+ state: The state to set `self` to.
1103
+ """
1104
+ with self._threading_lock:
1105
+ for flat_key, stats_state in state["stats"].items():
1106
+ self._set_key(flat_key.split("--"), Stats.from_state(stats_state))
1107
+
1108
+ def _check_tensor(self, key: Tuple[str], value) -> None:
1109
+ # `value` is a tensor -> Log it in our keys set.
1110
+ if self.tensor_mode and (
1111
+ (torch and torch.is_tensor(value)) or (tf and tf.is_tensor(value))
1112
+ ):
1113
+ self._tensor_keys.add(key)
1114
+
1115
+ def _key_in_stats(self, flat_key, *, stats=None):
1116
+ flat_key = force_tuple(tree.flatten(flat_key))
1117
+ _dict = stats if stats is not None else self.stats
1118
+ for key in flat_key:
1119
+ if key not in _dict:
1120
+ return False
1121
+ _dict = _dict[key]
1122
+ return True
1123
+
1124
+ def _get_key(self, flat_key, *, stats=None, key_error=True):
1125
+ flat_key = force_tuple(tree.flatten(flat_key))
1126
+ _dict = stats if stats is not None else self.stats
1127
+ for key in flat_key:
1128
+ try:
1129
+ _dict = _dict[key]
1130
+ except KeyError as e:
1131
+ if key_error:
1132
+ raise e
1133
+ else:
1134
+ return {}
1135
+ return _dict
1136
+
1137
+ def _set_key(self, flat_key, stats):
1138
+ flat_key = force_tuple(tree.flatten(flat_key))
1139
+
1140
+ with self._threading_lock:
1141
+ _dict = self.stats
1142
+ for i, key in enumerate(flat_key):
1143
+ # If we are at the end of the key sequence, set
1144
+ # the key, no matter, whether it already exists or not.
1145
+ if i == len(flat_key) - 1:
1146
+ _dict[key] = stats
1147
+ return
1148
+ # If an intermediary key in the sequence is missing,
1149
+ # add a sub-dict under this key.
1150
+ if key not in _dict:
1151
+ _dict[key] = {}
1152
+ _dict = _dict[key]
1153
+
1154
+ def _del_key(self, flat_key, key_error=False):
1155
+ flat_key = force_tuple(tree.flatten(flat_key))
1156
+
1157
+ with self._threading_lock:
1158
+ # Erase the tensor key as well, if applicable.
1159
+ if flat_key in self._tensor_keys:
1160
+ self._tensor_keys.discard(flat_key)
1161
+
1162
+ # Erase the key from the (nested) `self.stats` dict.
1163
+ _dict = self.stats
1164
+ try:
1165
+ for i, key in enumerate(flat_key):
1166
+ if i == len(flat_key) - 1:
1167
+ del _dict[key]
1168
+ return
1169
+ _dict = _dict[key]
1170
+ except KeyError as e:
1171
+ if key_error:
1172
+ raise e
1173
+
1174
+
1175
class _DummyRLock:
    """A no-op stand-in for `threading.RLock` (all lock operations do nothing)."""

    def acquire(self, blocking=True, timeout=-1):
        # Pretend the lock was successfully acquired.
        return True

    def release(self):
        # Nothing to release.
        return None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Returning None (falsy) propagates any in-flight exception, just like a
        # real lock's __exit__.
        return None
.venv/lib/python3.11/site-packages/ray/rllib/utils/metrics/stats.py ADDED
@@ -0,0 +1,757 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import defaultdict, deque
2
+ import time
3
+ import threading
4
+ from typing import Any, Callable, Dict, Optional, Tuple, Union
5
+
6
+ import numpy as np
7
+
8
+ from ray.rllib.utils import force_list
9
+ from ray.rllib.utils.framework import try_import_tf, try_import_torch
10
+ from ray.rllib.utils.numpy import convert_to_numpy
11
+
12
+ _, tf, _ = try_import_tf()
13
+ torch, _ = try_import_torch()
14
+
15
+
16
+ class Stats:
17
+ """A container class holding a number of values and executing reductions over them.
18
+
19
+ The individual values in a Stats object may be of any type, for example python int
20
+ or float, numpy arrays, or more complex structured (tuple, dict) and are stored in
21
+ a list under `self.values`.
22
+
23
+ Stats can be used to store metrics of the same type over time, for example a loss
24
+ or a learning rate, and to reduce all stored values applying a certain reduction
25
+ mechanism (for example "mean" or "sum").
26
+
27
+ Available reduction mechanisms are:
28
+ - "mean" using EMA with a configurable EMA coefficient.
29
+ - "mean" using a sliding window (over the last n stored values).
30
+ - "max/min" with an optional sliding window (over the last n stored values).
31
+ - "sum" with an optional sliding window (over the last n stored values).
32
+ - None: Simply store all logged values to an ever-growing list.
33
+
34
+ Through the `reduce()` API, one of the above-mentioned reduction mechanisms will
35
+ be executed on `self.values`.
36
+
37
+ .. testcode::
38
+
39
+ import time
40
+ from ray.rllib.utils.metrics.stats import Stats
41
+ from ray.rllib.utils.test_utils import check
42
+
43
+ # By default, we reduce using EMA (with default coeff=0.01).
44
+ stats = Stats() # use `ema_coeff` arg to change the coeff
45
+ stats.push(1.0)
46
+ stats.push(2.0)
47
+ # EMA formula used by Stats: t1 = (1.0 - ema_coeff) * t0 + ema_coeff * new_val
48
+ check(stats.peek(), 1.0 * (1.0 - 0.01) + 2.0 * 0.01)
49
+
50
+ # Here, we use a window over which to mean.
51
+ stats = Stats(window=2)
52
+ stats.push(1.0)
53
+ stats.push(2.0)
54
+ stats.push(3.0)
55
+ # Only mean over the last 2 items.
56
+ check(stats.peek(), 2.5)
57
+
58
+ # Here, we sum over the lifetime of the Stats object.
59
+ stats = Stats(reduce="sum")
60
+ stats.push(1)
61
+ check(stats.peek(), 1)
62
+ stats.push(2)
63
+ check(stats.peek(), 3)
64
+ stats.push(3)
65
+ check(stats.peek(), 6)
66
+ # So far, we have stored all values (1, 2, and 3).
67
+ check(stats.values, [1, 2, 3])
68
+ # Let's call the `reduce()` method to actually reduce these values
69
+ # to a single item of value=6:
70
+ stats = stats.reduce()
71
+ check(stats.peek(), 6)
72
+ check(stats.values, [6])
73
+
74
+ # "min" and "max" work analogous to "sum". But let's try with a `window` now:
75
+ stats = Stats(reduce="max", window=2)
76
+ stats.push(2)
77
+ check(stats.peek(), 2)
78
+ stats.push(3)
79
+ check(stats.peek(), 3)
80
+ stats.push(1)
81
+ check(stats.peek(), 3)
82
+ # However, when we push another value, the max thus-far (3) will go
83
+ # out of scope:
84
+ stats.push(-1)
85
+ check(stats.peek(), 1) # now, 1 is the max
86
+ # So far, we have stored all values (2, 3, 1, and -1).
87
+ check(stats.values, [2, 3, 1, -1])
88
+ # Let's call the `reduce()` method to actually reduce these values
89
+ # to a list of the most recent 2 (window size) values:
90
+ stats = stats.reduce()
91
+ check(stats.peek(), 1)
92
+ check(stats.values, [1, -1])
93
+
94
+ # We can also choose to not reduce at all (reduce=None).
95
+ # With a `window` given, Stats will simply keep (and return) the last
96
+ # `window` items in the values list.
97
+ # Note that we have to explicitly set reduce to None (b/c default is "mean").
98
+ stats = Stats(reduce=None, window=3)
99
+ stats.push(-5)
100
+ stats.push(-4)
101
+ stats.push(-3)
102
+ stats.push(-2)
103
+ check(stats.peek(), [-4, -3, -2]) # `window` (3) most recent values
104
+ # We have not reduced yet (all values are still stored):
105
+ check(stats.values, [-5, -4, -3, -2])
106
+ # Let's reduce:
107
+ stats = stats.reduce()
108
+ check(stats.peek(), [-4, -3, -2])
109
+ # Values are now shortened to contain only the most recent `window` items.
110
+ check(stats.values, [-4, -3, -2])
111
+
112
+ # We can even use Stats to time stuff. Here we sum up 2 time deltas,
113
+ # measured using a convenient with-block:
114
+ stats = Stats(reduce="sum")
115
+ check(len(stats.values), 0)
116
+ # First delta measurement:
117
+ with stats:
118
+ time.sleep(1.0)
119
+ check(len(stats.values), 1)
120
+ assert 1.1 > stats.peek() > 0.9
121
+ # Second delta measurement:
122
+ with stats:
123
+ time.sleep(1.0)
124
+ assert 2.2 > stats.peek() > 1.8
125
+ # When calling `reduce()`, the internal values list gets cleaned up.
126
+ check(len(stats.values), 2) # still both deltas in the values list
127
+ stats = stats.reduce()
128
+ check(len(stats.values), 1) # got reduced to one value (the sum)
129
+ assert 2.2 > stats.values[0] > 1.8
130
+ """
131
+
132
+ def __init__(
133
+ self,
134
+ init_value: Optional[Any] = None,
135
+ reduce: Optional[str] = "mean",
136
+ window: Optional[Union[int, float]] = None,
137
+ ema_coeff: Optional[float] = None,
138
+ clear_on_reduce: bool = False,
139
+ on_exit: Optional[Callable] = None,
140
+ throughput: Union[bool, float] = False,
141
+ ):
142
+ """Initializes a Stats instance.
143
+
144
+ Args:
145
+ init_value: Optional initial value to be placed into `self.values`. If None,
146
+ `self.values` will start empty.
147
+ reduce: The name of the reduce method to be used. Allowed are "mean", "min",
148
+ "max", and "sum". Use None to apply no reduction method (leave
149
+ `self.values` as-is when reducing, except for shortening it to
150
+ `window`). Note that if both `reduce` and `window` are None, the user of
151
+ this Stats object needs to apply some caution over the values list not
152
+ growing infinitely.
153
+ window: An optional window size to reduce over.
154
+ If `window` is not None, then the reduction operation is only applied to
155
+ the most recent `windows` items, and - after reduction - the values list
156
+ is shortened to hold at most `window` items (the most recent ones).
157
+ Must be None if `ema_coeff` is not None.
158
+ If `window` is None (and `ema_coeff` is None), reduction must not be
159
+ "mean".
160
+ TODO (sven): Allow window=float("inf"), iff clear_on_reduce=True.
161
+ This would enable cases where we want to accumulate n data points (w/o
162
+ limitation, then average over these, then reset the data pool on reduce,
163
+ e.g. for evaluation env_runner stats, which should NOT use any window,
164
+ just like in the old API stack).
165
+ ema_coeff: An optional EMA coefficient to use if reduce is "mean"
166
+ and no `window` is provided. Note that if both `window` and `ema_coeff`
167
+ are provided, an error is thrown. Also, if `ema_coeff` is provided,
168
+ `reduce` must be "mean".
169
+ The reduction formula for EMA performed by Stats is:
170
+ EMA(t1) = (1.0 - ema_coeff) * EMA(t0) + ema_coeff * new_value
171
+ clear_on_reduce: If True, the Stats object will reset its entire values list
172
+ to an empty one after `self.reduce()` is called. However, it will then
173
+ return from the `self.reduce()` call a new Stats object with the
174
+ properly reduced (not completely emptied) new values. Setting this
175
+ to True is useful for cases, in which the internal values list would
176
+ otherwise grow indefinitely, for example if reduce is None and there
177
+ is no `window` provided.
178
+ throughput: If True, track a throughput estimate together with this
179
+ Stats. This is only supported for `reduce=sum` and
180
+ `clear_on_reduce=False` metrics (aka. "lifetime counts"). The `Stats`
181
+ then keeps track of the time passed between two consecutive calls to
182
+ `reduce()` and update its throughput estimate. The current throughput
183
+ estimate of a key can be obtained through:
184
+ `peeked_val, throughput_per_sec = Stats.peek([key], throughput=True)`.
185
+ If a float, track throughput and also set current throughput estimate
186
+ to the given value.
187
+ """
188
+ # Thus far, we only support mean, max, min, and sum.
189
+ if reduce not in [None, "mean", "min", "max", "sum"]:
190
+ raise ValueError("`reduce` must be one of `mean|min|max|sum` or None!")
191
+ # One or both window and ema_coeff must be None.
192
+ if window is not None and ema_coeff is not None:
193
+ raise ValueError("Only one of `window` or `ema_coeff` can be specified!")
194
+ # If `ema_coeff` is provided, `reduce` must be "mean".
195
+ if ema_coeff is not None and reduce != "mean":
196
+ raise ValueError(
197
+ "`ema_coeff` arg only allowed (not None) when `reduce=mean`!"
198
+ )
199
+ # If `window` is explicitly set to inf, `clear_on_reduce` must be True.
200
+ # Otherwise, we risk a memory leak.
201
+ if window == float("inf") and not clear_on_reduce:
202
+ raise ValueError(
203
+ "When using an infinite window (float('inf'), `clear_on_reduce` must "
204
+ "be set to True!"
205
+ )
206
+
207
+ # If reduce=mean AND window=ema_coeff=None, we use EMA by default with a coeff
208
+ # of 0.01 (we do NOT support infinite window sizes for mean as that would mean
209
+ # to keep data in the cache forever).
210
+ if reduce == "mean" and window is None and ema_coeff is None:
211
+ ema_coeff = 0.01
212
+
213
+ # The actual data in this Stats object.
214
+ self.values = force_list(init_value)
215
+
216
+ self._reduce_method = reduce
217
+ self._window = window
218
+ self._ema_coeff = ema_coeff
219
+
220
+ # Timing functionality (keep start times per thread).
221
+ self._start_times = defaultdict(lambda: None)
222
+
223
+ # Simply store ths flag for the user of this class.
224
+ self._clear_on_reduce = clear_on_reduce
225
+
226
+ # Code to execute when exiting a with-context.
227
+ self._on_exit = on_exit
228
+
229
+ # On each `.reduce()` call, we store the result of this call in hist[0] and the
230
+ # previous `reduce()` result in hist[1].
231
+ self._hist = deque([0, 0, 0], maxlen=3)
232
+
233
+ self._throughput = throughput if throughput is not True else 0.0
234
+ if self._throughput is not False:
235
+ assert self._reduce_method == "sum"
236
+ assert self._window in [None, float("inf")]
237
+ self._throughput_last_time = -1
238
+
239
    def push(self, value: Any) -> None:
        """Appends a new value into the internal values list.

        No reduction happens here; reduction is deferred to `peek()`/`reduce()`.

        Args:
            value: The value item to be appended to the internal values list
                (`self.values`).
        """
        self.values.append(value)
247
+
248
    def __enter__(self) -> "Stats":
        """Called when entering a context (with which users can measure a time delta).

        Records a start time for the *current thread* (keyed by thread id in
        `self._start_times`), so several threads may time with the same Stats
        object concurrently without clobbering each other's start times.

        Returns:
            This Stats instance (`self`).

        NOTE(review): An earlier design (see the commented-out assert below)
        apparently returned a clone of `self` when another thread had already
        entered; the current code always returns `self` and instead keeps one
        start time per thread -- confirm the docstring/behavior intent upstream.
        """
        thread_id = threading.get_ident()
        # assert self._start_times[thread_id] is None
        self._start_times[thread_id] = time.perf_counter()
        return self
265
+
266
+ def __exit__(self, exc_type, exc_value, tb) -> None:
267
+ """Called when exiting a context (with which users can measure a time delta)."""
268
+ thread_id = threading.get_ident()
269
+ assert self._start_times[thread_id] is not None
270
+ time_delta_s = time.perf_counter() - self._start_times[thread_id]
271
+ self.push(time_delta_s)
272
+
273
+ # Call the on_exit handler.
274
+ if self._on_exit:
275
+ self._on_exit(time_delta_s)
276
+
277
+ del self._start_times[thread_id]
278
+
279
    def peek(self, *, previous: Optional[int] = None, throughput: bool = False) -> Any:
        """Returns the result of reducing the internal values list.

        Note that this method does NOT alter the internal values list in this process.
        Thus, users can call this method to get an accurate look at the reduced value
        given the current internal values list.

        Args:
            previous: If provided (int), returns that previously (reduced) result of
                this `Stats` object, which was generated `previous` number of
                `reduce()` calls ago. If None (default), returns the current (reduced)
                value.
            throughput: If True (and `previous` is None), returns the current
                throughput estimate instead of the reduced value (None if throughput
                tracking is disabled).

        Returns:
            The result of reducing the internal values list (or the previously
            computed reduced result, if `previous` is not None; or the throughput
            estimate, if `throughput` is True).
        """
        # Return previously reduced value. NOTE: `previous` indexes back from the end
        # of the (maxlen=3) history deque, so only the last few `reduce()` results
        # are available.
        if previous is not None:
            return self._hist[-abs(previous)]
        # Return the last measured throughput (None if tracking is disabled).
        elif throughput:
            return self._throughput if self._throughput is not False else None
        return self._reduced_values()[0]
302
+
303
    def reduce(self) -> "Stats":
        """Reduces the internal values list according to the constructor settings.

        Thereby, the internal values list is changed (note that this is different from
        `peek()`, where the internal list is NOT changed). See the docstring of this
        class for details on the reduction logic applied to the values list, based on
        the constructor settings, such as `window`, `reduce`, etc..

        Returns:
            `self` (now reduced) if `self._clear_on_reduce` is False.
            Otherwise a new, empty `Stats` object with the same constructor settings
            (window, reduce, etc..) as `self`, while `self` keeps the reduced values.
        """
        reduced, values = self._reduced_values()

        # Keep track and update underlying throughput metric.
        if self._throughput is not False:
            # Take the delta between the new (upcoming) reduced value and the most
            # recently reduced value (one `reduce()` call ago).
            delta_sum = reduced - self._hist[-1]
            time_now = time.perf_counter()
            # `delta_sum` may be < 0.0 if user overrides a metric through
            # `.set_value()`.
            if self._throughput_last_time == -1 or delta_sum < 0.0:
                # First `reduce()` call (or value was reset) -> no valid rate yet.
                self._throughput = np.nan
            else:
                delta_time = time_now - self._throughput_last_time
                assert delta_time >= 0.0
                self._throughput = delta_sum / delta_time
            self._throughput_last_time = time_now

        # Reduce everything to a single (init) value.
        self.values = values

        # Shift historic reduced values by one in our hist-deque.
        self._hist.append(reduced)

        # `clear_on_reduce` -> Return an empty new Stats object with the same settings
        # as `self`.
        if self._clear_on_reduce:
            return Stats.similar_to(self)
        # No reset required upon `reduce()` -> Return `self`.
        else:
            return self
347
+
348
    def merge_on_time_axis(self, other: "Stats") -> None:
        """Appends `other`'s values to `self.values`, treating them as newer data.

        Args:
            other: The other Stats object whose values are appended (in order) to
                the end of `self.values`. Must have the exact same reduce/window/
                ema_coeff settings as `self`.
        """
        # Make sure `others` have same reduction settings.
        assert self._reduce_method == other._reduce_method
        assert self._window == other._window
        assert self._ema_coeff == other._ema_coeff

        # Extend `self`'s values by `other`'s.
        self.values.extend(other.values)

        # Slice by window size, if provided.
        if self._window not in [None, float("inf")]:
            self.values = self.values[-self._window :]

        # Adopt `other`'s current throughput estimate (it's the newer one).
        if self._throughput is not False:
            self._throughput = other._throughput
364
+
365
+ def merge_in_parallel(self, *others: "Stats") -> None:
366
+ """Merges all internal values of `others` into `self`'s internal values list.
367
+
368
+ Thereby, the newly incoming values of `others` are treated equally with respect
369
+ to each other as well as with respect to the internal values of self.
370
+
371
+ Use this method to merge other `Stats` objects, which resulted from some
372
+ parallelly executed components, into this one. For example: n Learner workers
373
+ all returning a loss value in the form of `{"total_loss": [some value]}`.
374
+
375
+ The following examples demonstrate the parallel merging logic for different
376
+ reduce- and window settings:
377
+
378
+ .. testcode::
379
+ from ray.rllib.utils.metrics.stats import Stats
380
+ from ray.rllib.utils.test_utils import check
381
+
382
+ # Parallel-merge two (reduce=mean) stats with window=3.
383
+ stats = Stats(reduce="mean", window=3)
384
+ stats1 = Stats(reduce="mean", window=3)
385
+ stats1.push(0)
386
+ stats1.push(1)
387
+ stats1.push(2)
388
+ stats1.push(3)
389
+ stats2 = Stats(reduce="mean", window=3)
390
+ stats2.push(4000)
391
+ stats2.push(4)
392
+ stats2.push(5)
393
+ stats2.push(6)
394
+ stats.merge_in_parallel(stats1, stats2)
395
+ # Fill new merged-values list:
396
+ # - Start with index -1, moving to the start.
397
+ # - Thereby always reducing across the different Stats objects' at the
398
+ # current index.
399
+ # - The resulting reduced value (across Stats at current index) is then
400
+ # repeated AND added to the new merged-values list n times (where n is
401
+ # the number of Stats, across which we merge).
402
+ # - The merged-values list is reversed.
403
+ # Here:
404
+ # index -1: [3, 6] -> [4.5, 4.5]
405
+ # index -2: [2, 5] -> [4.5, 4.5, 3.5, 3.5]
406
+ # STOP after merged list contains >= 3 items (window size)
407
+ # reverse: [3.5, 3.5, 4.5, 4.5]
408
+ check(stats.values, [3.5, 3.5, 4.5, 4.5])
409
+ check(stats.peek(), (3.5 + 4.5 + 4.5) / 3) # mean last 3 items (window)
410
+
411
+ # Parallel-merge two (reduce=max) stats with window=3.
412
+ stats = Stats(reduce="max", window=3)
413
+ stats1 = Stats(reduce="max", window=3)
414
+ stats1.push(1)
415
+ stats1.push(2)
416
+ stats1.push(3)
417
+ stats2 = Stats(reduce="max", window=3)
418
+ stats2.push(4)
419
+ stats2.push(5)
420
+ stats2.push(6)
421
+ stats.merge_in_parallel(stats1, stats2)
422
+ # Same here: Fill new merged-values list:
423
+ # - Start with index -1, moving to the start.
424
+ # - Thereby always reduce across the different Stats objects' at the
425
+ # current index.
426
+ # - The resulting reduced value (across Stats at current index) is then
427
+ # repeated AND added to the new merged-values list n times (where n is the
428
+ # number of Stats, across which we merge).
429
+ # - The merged-values list is reversed.
430
+ # Here:
431
+ # index -1: [3, 6] -> [6, 6]
432
+ # index -2: [2, 5] -> [6, 6, 5, 5]
433
+ # STOP after merged list contains >= 3 items (window size)
434
+ # reverse: [5, 5, 6, 6]
435
+ check(stats.values, [5, 5, 6, 6])
436
+ check(stats.peek(), 6) # max is 6
437
+
438
+ # Parallel-merge two (reduce=min) stats with window=4.
439
+ stats = Stats(reduce="min", window=4)
440
+ stats1 = Stats(reduce="min", window=4)
441
+ stats1.push(1)
442
+ stats1.push(2)
443
+ stats1.push(1)
444
+ stats1.push(4)
445
+ stats2 = Stats(reduce="min", window=4)
446
+ stats2.push(5)
447
+ stats2.push(0.5)
448
+ stats2.push(7)
449
+ stats2.push(8)
450
+ stats.merge_in_parallel(stats1, stats2)
451
+ # Same procedure:
452
+ # index -1: [4, 8] -> [4, 4]
453
+ # index -2: [1, 7] -> [4, 4, 1, 1]
454
+ # STOP after merged list contains >= 4 items (window size)
455
+ # reverse: [1, 1, 4, 4]
456
+ check(stats.values, [1, 1, 4, 4])
457
+ check(stats.peek(), 1) # min is 1
458
+
459
+ # Parallel-merge two (reduce=sum) stats with no window.
460
+ # Note that when reduce="sum", we do NOT reduce across the indices of the
461
+ # parallel values.
462
+ stats = Stats(reduce="sum")
463
+ stats1 = Stats(reduce="sum")
464
+ stats1.push(1)
465
+ stats1.push(2)
466
+ stats1.push(0)
467
+ stats1.push(3)
468
+ stats2 = Stats(reduce="sum")
469
+ stats2.push(4)
470
+ stats2.push(5)
471
+ stats2.push(6)
472
+ # index -1: [3, 6] -> [3, 6] (no reduction, leave values as-is)
473
+ # index -2: [0, 5] -> [3, 6, 0, 5]
474
+ # index -3: [2, 4] -> [3, 6, 0, 5, 2, 4]
475
+ # index -4: [1] -> [3, 6, 0, 5, 2, 4, 1]
476
+ # reverse: [1, 4, 2, 5, 0, 6, 3]
477
+ stats.merge_in_parallel(stats1, stats2)
478
+ check(stats.values, [1, 4, 2, 5, 0, 6, 3])
479
+ check(stats.peek(), 21)
480
+
481
+ # Parallel-merge two "concat" (reduce=None) stats with no window.
482
+ # Note that when reduce=None, we do NOT reduce across the indices of the
483
+ # parallel values.
484
+ stats = Stats(reduce=None, window=float("inf"), clear_on_reduce=True)
485
+ stats1 = Stats(reduce=None, window=float("inf"), clear_on_reduce=True)
486
+ stats1.push(1)
487
+ stats2 = Stats(reduce=None, window=float("inf"), clear_on_reduce=True)
488
+ stats2.push(2)
489
+ # index -1: [1, 2] -> [1, 2] (no reduction, leave values as-is)
490
+ # reverse: [2, 1]
491
+ stats.merge_in_parallel(stats1, stats2)
492
+ check(stats.values, [2, 1])
493
+ check(stats.peek(), [2, 1])
494
+
495
+ Args:
496
+ others: One or more other Stats objects that need to be parallely merged
497
+ into `self, meaning with equal weighting as the existing values in
498
+ `self`.
499
+ """
500
+ # Make sure `others` have same reduction settings.
501
+ assert all(
502
+ self._reduce_method == o._reduce_method
503
+ and self._window == o._window
504
+ and self._ema_coeff == o._ema_coeff
505
+ for o in others
506
+ )
507
+ win = self._window or float("inf")
508
+
509
+ # Take turns stepping through `self` and `*others` values, thereby moving
510
+ # backwards from last index to beginning and will up the resulting values list.
511
+ # Stop as soon as we reach the window size.
512
+ new_values = []
513
+ tmp_values = []
514
+ # Loop from index=-1 backward to index=start until our new_values list has
515
+ # at least a len of `win`.
516
+ for i in range(1, max(map(len, [self, *others])) + 1):
517
+ # Per index, loop through all involved stats, including `self` and add
518
+ # to `tmp_values`.
519
+ for stats in [self, *others]:
520
+ if len(stats) < i:
521
+ continue
522
+ tmp_values.append(stats.values[-i])
523
+
524
+ # Now reduce across `tmp_values` based on the reduce-settings of this Stats.
525
+ # TODO (sven) : explain why all this
526
+ if self._ema_coeff is not None:
527
+ new_values.extend([np.nanmean(tmp_values)] * len(tmp_values))
528
+ elif self._reduce_method in [None, "sum"]:
529
+ new_values.extend(tmp_values)
530
+ else:
531
+ new_values.extend(
532
+ [self._reduced_values(values=tmp_values, window=float("inf"))[0]]
533
+ * len(tmp_values)
534
+ )
535
+ tmp_values.clear()
536
+ if len(new_values) >= win:
537
+ break
538
+
539
+ self.values = list(reversed(new_values))
540
+
541
+ def set_to_numpy_values(self, values) -> None:
542
+ """Converts `self.values` from tensors to actual numpy values.
543
+
544
+ Args:
545
+ values: The (numpy) values to set `self.values` to.
546
+ """
547
+ numpy_values = convert_to_numpy(values)
548
+ if self._reduce_method is None:
549
+ assert isinstance(values, list) and len(self.values) >= len(values)
550
+ self.values = numpy_values
551
+ else:
552
+ assert len(self.values) > 0
553
+ self.values = [numpy_values]
554
+
555
+ def __len__(self) -> int:
556
+ """Returns the length of the internal values list."""
557
+ return len(self.values)
558
+
559
+ def __repr__(self) -> str:
560
+ win_or_ema = (
561
+ f"; win={self._window}"
562
+ if self._window
563
+ else f"; ema={self._ema_coeff}"
564
+ if self._ema_coeff
565
+ else ""
566
+ )
567
+ return (
568
+ f"Stats({self.peek()}; len={len(self)}; "
569
+ f"reduce={self._reduce_method}{win_or_ema})"
570
+ )
571
+
572
+ def __int__(self):
573
+ return int(self.peek())
574
+
575
+ def __float__(self):
576
+ return float(self.peek())
577
+
578
+ def __eq__(self, other):
579
+ return float(self) == float(other)
580
+
581
+ def __le__(self, other):
582
+ return float(self) <= float(other)
583
+
584
+ def __ge__(self, other):
585
+ return float(self) >= float(other)
586
+
587
+ def __lt__(self, other):
588
+ return float(self) < float(other)
589
+
590
+ def __gt__(self, other):
591
+ return float(self) > float(other)
592
+
593
+ def __add__(self, other):
594
+ return float(self) + float(other)
595
+
596
+ def __sub__(self, other):
597
+ return float(self) - float(other)
598
+
599
+ def __mul__(self, other):
600
+ return float(self) * float(other)
601
+
602
+ def __format__(self, fmt):
603
+ return f"{float(self):{fmt}}"
604
+
605
+ def get_state(self) -> Dict[str, Any]:
606
+ return {
607
+ "values": convert_to_numpy(self.values),
608
+ "reduce": self._reduce_method,
609
+ "window": self._window,
610
+ "ema_coeff": self._ema_coeff,
611
+ "clear_on_reduce": self._clear_on_reduce,
612
+ "_hist": list(self._hist),
613
+ }
614
+
615
+ @staticmethod
616
+ def from_state(state: Dict[str, Any]) -> "Stats":
617
+ stats = Stats(
618
+ state["values"],
619
+ reduce=state["reduce"],
620
+ window=state["window"],
621
+ ema_coeff=state["ema_coeff"],
622
+ clear_on_reduce=state["clear_on_reduce"],
623
+ )
624
+ stats._hist = deque(state["_hist"], maxlen=stats._hist.maxlen)
625
+ return stats
626
+
627
+ @staticmethod
628
+ def similar_to(
629
+ other: "Stats",
630
+ init_value: Optional[Any] = None,
631
+ ) -> "Stats":
632
+ """Returns a new Stats object that's similar to `other`.
633
+
634
+ "Similar" here means it has the exact same settings (reduce, window, ema_coeff,
635
+ etc..). The initial values of the returned `Stats` are empty by default, but
636
+ can be set as well.
637
+
638
+ Args:
639
+ other: The other Stats object to return a similar new Stats equivalent for.
640
+ init_value: The initial value to already push into the returned Stats. If
641
+ None (default), the returned Stats object will have no values in it.
642
+
643
+ Returns:
644
+ A new Stats object similar to `other`, with the exact same settings and
645
+ maybe a custom initial value (if provided; otherwise empty).
646
+ """
647
+ stats = Stats(
648
+ init_value=init_value,
649
+ reduce=other._reduce_method,
650
+ window=other._window,
651
+ ema_coeff=other._ema_coeff,
652
+ clear_on_reduce=other._clear_on_reduce,
653
+ throughput=other._throughput,
654
+ )
655
+ stats._hist = other._hist
656
+ return stats
657
+
658
+ def _reduced_values(self, values=None, window=None) -> Tuple[Any, Any]:
659
+ """Runs a non-commited reduction procedure on given values (or `self.values`).
660
+
661
+ Note that this method does NOT alter any state of `self` or the possibly
662
+ provided list of `values`. It only returns new values as they should be
663
+ adopted after a possible, actual reduction step.
664
+
665
+ Args:
666
+ values: The list of values to reduce. If not None, use `self.values`
667
+ window: A possible override window setting to use (instead of
668
+ `self._window`). Use float('inf') here for an infinite window size.
669
+
670
+ Returns:
671
+ A tuple containing 1) the reduced value and 2) the new internal values list
672
+ to be used.
673
+ """
674
+ values = values if values is not None else self.values
675
+ window = window if window is not None else self._window
676
+ inf_window = window in [None, float("inf")]
677
+
678
+ # Apply the window (if provided and not inf).
679
+ values = values if inf_window else values[-window:]
680
+
681
+ # No reduction method. Return list as-is OR reduce list to len=window.
682
+ if self._reduce_method is None:
683
+ return values, values
684
+
685
+ # Special case: Internal values list is empty -> return NaN or 0.0 for sum.
686
+ elif len(values) == 0:
687
+ if self._reduce_method in ["min", "max", "mean"]:
688
+ return float("nan"), []
689
+ else:
690
+ return 0, []
691
+
692
+ # Do EMA (always a "mean" reduction; possibly using a window).
693
+ elif self._ema_coeff is not None:
694
+ # Perform EMA reduction over all values in internal values list.
695
+ mean_value = values[0]
696
+ for v in values[1:]:
697
+ mean_value = self._ema_coeff * v + (1.0 - self._ema_coeff) * mean_value
698
+ if inf_window:
699
+ return mean_value, [mean_value]
700
+ else:
701
+ return mean_value, values
702
+ # Do non-EMA reduction (possibly using a window).
703
+ else:
704
+ # Use the numpy/torch "nan"-prefix to ignore NaN's in our value lists.
705
+ if torch and torch.is_tensor(values[0]):
706
+ assert all(torch.is_tensor(v) for v in values), values
707
+ # TODO (sven) If the shape is (), do NOT even use the reduce method.
708
+ # Using `tf.reduce_mean()` here actually lead to a completely broken
709
+ # DreamerV3 (for a still unknown exact reason).
710
+ if len(values[0].shape) == 0:
711
+ reduced = values[0]
712
+ else:
713
+ reduce_meth = getattr(torch, "nan" + self._reduce_method)
714
+ reduce_in = torch.stack(values)
715
+ if self._reduce_method == "mean":
716
+ reduce_in = reduce_in.float()
717
+ reduced = reduce_meth(reduce_in)
718
+ elif tf and tf.is_tensor(values[0]):
719
+ # TODO (sven): Currently, tensor metrics only work with window=1.
720
+ # We might want o enforce it more formally, b/c it's probably not a
721
+ # good idea to have MetricsLogger or Stats tinker with the actual
722
+ # computation graph that users are trying to build in their loss
723
+ # functions.
724
+ assert len(values) == 1
725
+ # TODO (sven) If the shape is (), do NOT even use the reduce method.
726
+ # Using `tf.reduce_mean()` here actually lead to a completely broken
727
+ # DreamerV3 (for a still unknown exact reason).
728
+ if len(values[0].shape) == 0:
729
+ reduced = values[0]
730
+ else:
731
+ reduce_meth = getattr(tf, "reduce_" + self._reduce_method)
732
+ reduced = reduce_meth(values)
733
+
734
+ else:
735
+ reduce_meth = getattr(np, "nan" + self._reduce_method)
736
+ reduced = reduce_meth(values)
737
+
738
+ # Convert from numpy to primitive python types, if original `values` are
739
+ # python types.
740
+ if reduced.shape == () and isinstance(values[0], (int, float)):
741
+ if reduced.dtype in [np.int32, np.int64, np.int8, np.int16]:
742
+ reduced = int(reduced)
743
+ else:
744
+ reduced = float(reduced)
745
+
746
+ # For window=None|inf (infinite window) and reduce != mean, we don't have to
747
+ # keep any values, except the last (reduced) one.
748
+ if inf_window and self._reduce_method != "mean":
749
+ # TODO (sven): What if values are torch tensors? In this case, we
750
+ # would have to do reduction using `torch` above (not numpy) and only
751
+ # then return the python primitive AND put the reduced new torch
752
+ # tensor in the new `self.values`.
753
+ return reduced, [reduced]
754
+ # In all other cases, keep the values that were also used for the reduce
755
+ # operation.
756
+ else:
757
+ return reduced, values
.venv/lib/python3.11/site-packages/ray/rllib/utils/metrics/window_stat.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
+ from ray.rllib.utils.annotations import OldAPIStack
4
+
5
+
6
+ @OldAPIStack
7
+ class WindowStat:
8
+ """Handles/stores incoming dataset and provides window-based statistics.
9
+
10
+ .. testcode::
11
+ :skipif: True
12
+
13
+ win_stats = WindowStat("level", 3)
14
+ win_stats.push(5.0)
15
+ win_stats.push(7.0)
16
+ win_stats.push(7.0)
17
+ win_stats.push(10.0)
18
+ # Expect 8.0 as the mean of the last 3 values: (7+7+10)/3=8.0
19
+ print(win_stats.mean())
20
+
21
+ .. testoutput::
22
+
23
+ 8.0
24
+ """
25
+
26
+ def __init__(self, name: str, n: int):
27
+ """Initializes a WindowStat instance.
28
+
29
+ Args:
30
+ name: The name of the stats to collect and return stats for.
31
+ n: The window size. Statistics will be computed for the last n
32
+ items received from the stream.
33
+ """
34
+ # The window-size.
35
+ self.window_size = n
36
+ # The name of the data (used for `self.stats()`).
37
+ self.name = name
38
+ # List of items to do calculations over (len=self.n).
39
+ self.items = [None] * self.window_size
40
+ # The current index to insert the next item into `self.items`.
41
+ self.idx = 0
42
+ # How many items have been added over the lifetime of this object.
43
+ self.count = 0
44
+
45
+ def push(self, obj) -> None:
46
+ """Pushes a new value/object into the data buffer."""
47
+ # Insert object at current index.
48
+ self.items[self.idx] = obj
49
+ # Increase insertion index by 1.
50
+ self.idx += 1
51
+ # Increase lifetime count by 1.
52
+ self.count += 1
53
+ # Fix index in case of rollover.
54
+ self.idx %= len(self.items)
55
+
56
+ def mean(self) -> float:
57
+ """Returns the (NaN-)mean of the last `self.window_size` items."""
58
+ return float(np.nanmean(self.items[: self.count]))
59
+
60
+ def std(self) -> float:
61
+ """Returns the (NaN)-stddev of the last `self.window_size` items."""
62
+ return float(np.nanstd(self.items[: self.count]))
63
+
64
+ def quantiles(self) -> np.ndarray:
65
+ """Returns ndarray with 0, 10, 50, 90, and 100 percentiles."""
66
+ if not self.count:
67
+ return np.ndarray([], dtype=np.float32)
68
+ else:
69
+ return np.nanpercentile(
70
+ self.items[: self.count], [0, 10, 50, 90, 100]
71
+ ).tolist()
72
+
73
+ def stats(self):
74
+ return {
75
+ self.name + "_count": int(self.count),
76
+ self.name + "_mean": self.mean(),
77
+ self.name + "_std": self.std(),
78
+ self.name + "_quantiles": self.quantiles(),
79
+ }
.venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/__init__.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ray.rllib.utils.replay_buffers.episode_replay_buffer import EpisodeReplayBuffer
2
+ from ray.rllib.utils.replay_buffers.fifo_replay_buffer import FifoReplayBuffer
3
+ from ray.rllib.utils.replay_buffers.multi_agent_mixin_replay_buffer import (
4
+ MultiAgentMixInReplayBuffer,
5
+ )
6
+ from ray.rllib.utils.replay_buffers.multi_agent_episode_buffer import (
7
+ MultiAgentEpisodeReplayBuffer,
8
+ )
9
+ from ray.rllib.utils.replay_buffers.multi_agent_prioritized_episode_buffer import (
10
+ MultiAgentPrioritizedEpisodeReplayBuffer,
11
+ )
12
+ from ray.rllib.utils.replay_buffers.multi_agent_prioritized_replay_buffer import (
13
+ MultiAgentPrioritizedReplayBuffer,
14
+ )
15
+ from ray.rllib.utils.replay_buffers.multi_agent_replay_buffer import (
16
+ MultiAgentReplayBuffer,
17
+ ReplayMode,
18
+ )
19
+ from ray.rllib.utils.replay_buffers.prioritized_episode_buffer import (
20
+ PrioritizedEpisodeReplayBuffer,
21
+ )
22
+ from ray.rllib.utils.replay_buffers.prioritized_replay_buffer import (
23
+ PrioritizedReplayBuffer,
24
+ )
25
+ from ray.rllib.utils.replay_buffers.replay_buffer import ReplayBuffer, StorageUnit
26
+ from ray.rllib.utils.replay_buffers.reservoir_replay_buffer import ReservoirReplayBuffer
27
+ from ray.rllib.utils.replay_buffers import utils
28
+
29
+ __all__ = [
30
+ "EpisodeReplayBuffer",
31
+ "FifoReplayBuffer",
32
+ "MultiAgentEpisodeReplayBuffer",
33
+ "MultiAgentMixInReplayBuffer",
34
+ "MultiAgentPrioritizedEpisodeReplayBuffer",
35
+ "MultiAgentPrioritizedReplayBuffer",
36
+ "MultiAgentReplayBuffer",
37
+ "PrioritizedEpisodeReplayBuffer",
38
+ "PrioritizedReplayBuffer",
39
+ "ReplayMode",
40
+ "ReplayBuffer",
41
+ "ReservoirReplayBuffer",
42
+ "StorageUnit",
43
+ "utils",
44
+ ]
.venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (1.87 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/__pycache__/base.cpython-311.pyc ADDED
Binary file (3.77 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/__pycache__/episode_replay_buffer.cpython-311.pyc ADDED
Binary file (43.2 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/__pycache__/fifo_replay_buffer.cpython-311.pyc ADDED
Binary file (5.1 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/__pycache__/multi_agent_episode_buffer.cpython-311.pyc ADDED
Binary file (44.1 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/__pycache__/multi_agent_mixin_replay_buffer.cpython-311.pyc ADDED
Binary file (18.8 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/__pycache__/multi_agent_prioritized_episode_buffer.cpython-311.pyc ADDED
Binary file (39.4 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/__pycache__/multi_agent_prioritized_replay_buffer.cpython-311.pyc ADDED
Binary file (13.4 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/__pycache__/multi_agent_replay_buffer.cpython-311.pyc ADDED
Binary file (20.1 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/__pycache__/prioritized_episode_buffer.cpython-311.pyc ADDED
Binary file (30.2 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/__pycache__/prioritized_replay_buffer.cpython-311.pyc ADDED
Binary file (12.3 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/__pycache__/replay_buffer.cpython-311.pyc ADDED
Binary file (17.2 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/__pycache__/reservoir_replay_buffer.cpython-311.pyc ADDED
Binary file (6.34 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/__pycache__/simple_replay_buffer.cpython-311.pyc ADDED
Binary file (215 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/__pycache__/utils.cpython-311.pyc ADDED
Binary file (17.6 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/base.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABCMeta, abstractmethod
2
+ import platform
3
+ from typing import Any, Dict, Optional
4
+
5
+ from ray.util.annotations import DeveloperAPI
6
+
7
+
8
+ @DeveloperAPI
9
+ class ReplayBufferInterface(metaclass=ABCMeta):
10
+ """Abstract base class for all of RLlib's replay buffers.
11
+
12
+ Mainly defines the `add()` and `sample()` methods that every buffer class
13
+ must implement to be usable by an Algorithm.
14
+ Buffers may determine on all the implementation details themselves, e.g.
15
+ whether to store single timesteps, episodes, or episode fragments or whether
16
+ to return fixed batch sizes or per-call defined ones.
17
+ """
18
+
19
+ @abstractmethod
20
+ @DeveloperAPI
21
+ def __len__(self) -> int:
22
+ """Returns the number of items currently stored in this buffer."""
23
+
24
+ @abstractmethod
25
+ @DeveloperAPI
26
+ def add(self, batch: Any, **kwargs) -> None:
27
+ """Adds a batch of experiences or other data to this buffer.
28
+
29
+ Args:
30
+ batch: Batch or data to add.
31
+ ``**kwargs``: Forward compatibility kwargs.
32
+ """
33
+
34
+ @abstractmethod
35
+ @DeveloperAPI
36
+ def sample(self, num_items: Optional[int] = None, **kwargs) -> Any:
37
+ """Samples `num_items` items from this buffer.
38
+
39
+ The exact shape of the returned data depends on the buffer's implementation.
40
+
41
+ Args:
42
+ num_items: Number of items to sample from this buffer.
43
+ ``**kwargs``: Forward compatibility kwargs.
44
+
45
+ Returns:
46
+ A batch of items.
47
+ """
48
+
49
+ @abstractmethod
50
+ @DeveloperAPI
51
+ def get_state(self) -> Dict[str, Any]:
52
+ """Returns all local state in a dict.
53
+
54
+ Returns:
55
+ The serializable local state.
56
+ """
57
+
58
+ @abstractmethod
59
+ @DeveloperAPI
60
+ def set_state(self, state: Dict[str, Any]) -> None:
61
+ """Restores all local state to the provided `state`.
62
+
63
+ Args:
64
+ state: The new state to set this buffer. Can be obtained by calling
65
+ `self.get_state()`.
66
+ """
67
+
68
+ @DeveloperAPI
69
+ def get_host(self) -> str:
70
+ """Returns the computer's network name.
71
+
72
+ Returns:
73
+ The computer's networks name or an empty string, if the network
74
+ name could not be determined.
75
+ """
76
+ return platform.node()
.venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/episode_replay_buffer.py ADDED
@@ -0,0 +1,1098 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import deque
2
+ import copy
3
+ import hashlib
4
+ from typing import Any, Dict, List, Optional, Tuple, Union
5
+
6
+ import numpy as np
7
+ import scipy
8
+
9
+ from ray.rllib.core import DEFAULT_AGENT_ID
10
+ from ray.rllib.env.single_agent_episode import SingleAgentEpisode
11
+ from ray.rllib.env.utils.infinite_lookback_buffer import InfiniteLookbackBuffer
12
+ from ray.rllib.utils import force_list
13
+ from ray.rllib.utils.annotations import (
14
+ override,
15
+ OverrideToImplementCustomLogic_CallToSuperRecommended,
16
+ )
17
+ from ray.rllib.utils.metrics import (
18
+ ACTUAL_N_STEP,
19
+ AGENT_ACTUAL_N_STEP,
20
+ AGENT_STEP_UTILIZATION,
21
+ ENV_STEP_UTILIZATION,
22
+ NUM_AGENT_EPISODES_STORED,
23
+ NUM_AGENT_EPISODES_ADDED,
24
+ NUM_AGENT_EPISODES_ADDED_LIFETIME,
25
+ NUM_AGENT_EPISODES_EVICTED,
26
+ NUM_AGENT_EPISODES_EVICTED_LIFETIME,
27
+ NUM_AGENT_EPISODES_PER_SAMPLE,
28
+ NUM_AGENT_STEPS_STORED,
29
+ NUM_AGENT_STEPS_ADDED,
30
+ NUM_AGENT_STEPS_ADDED_LIFETIME,
31
+ NUM_AGENT_STEPS_EVICTED,
32
+ NUM_AGENT_STEPS_EVICTED_LIFETIME,
33
+ NUM_AGENT_STEPS_PER_SAMPLE,
34
+ NUM_AGENT_STEPS_PER_SAMPLE_LIFETIME,
35
+ NUM_AGENT_STEPS_SAMPLED,
36
+ NUM_AGENT_STEPS_SAMPLED_LIFETIME,
37
+ NUM_ENV_STEPS_STORED,
38
+ NUM_ENV_STEPS_ADDED,
39
+ NUM_ENV_STEPS_ADDED_LIFETIME,
40
+ NUM_ENV_STEPS_EVICTED,
41
+ NUM_ENV_STEPS_EVICTED_LIFETIME,
42
+ NUM_ENV_STEPS_PER_SAMPLE,
43
+ NUM_ENV_STEPS_PER_SAMPLE_LIFETIME,
44
+ NUM_ENV_STEPS_SAMPLED,
45
+ NUM_ENV_STEPS_SAMPLED_LIFETIME,
46
+ NUM_EPISODES_STORED,
47
+ NUM_EPISODES_ADDED,
48
+ NUM_EPISODES_ADDED_LIFETIME,
49
+ NUM_EPISODES_EVICTED,
50
+ NUM_EPISODES_EVICTED_LIFETIME,
51
+ NUM_EPISODES_PER_SAMPLE,
52
+ )
53
+ from ray.rllib.utils.metrics.metrics_logger import MetricsLogger
54
+ from ray.rllib.utils.replay_buffers.base import ReplayBufferInterface
55
+ from ray.rllib.utils.typing import SampleBatchType, ResultDict
56
+
57
+
58
+ class EpisodeReplayBuffer(ReplayBufferInterface):
59
+ """Buffer that stores (completed or truncated) episodes by their ID.
60
+
61
+ Each "row" (a slot in a deque) in the buffer is occupied by one episode. If an
62
+ incomplete episode is added to the buffer and then another chunk of that episode is
63
+ added at a later time, the buffer will automatically concatenate the new fragment to
64
+ the original episode. This way, episodes can be completed via subsequent `add`
65
+ calls.
66
+
67
+ Sampling returns batches of size B (number of "rows"), where each row is a
68
+ trajectory of length T. Each trajectory contains consecutive timesteps from an
69
+ episode, but might not start at the beginning of that episode. Should an episode end
70
+ within such a trajectory, a random next episode (starting from its t0) will be
71
+ concatenated to that "row". Example: `sample(B=2, T=4)` ->
72
+
73
+ 0 .. 1 .. 2 .. 3 <- T-axis
74
+ 0 e5 e6 e7 e8
75
+ 1 f2 f3 h0 h2
76
+ ^ B-axis
77
+
78
+ .. where e, f, and h are different (randomly picked) episodes, the 0-index (e.g. h0)
79
+ indicates the start of an episode, and `f3` is an episode end (gym environment
80
+ returned terminated=True or truncated=True).
81
+
82
+ 0-indexed returned timesteps contain the reset observation, a dummy 0.0 reward, as
83
+ well as the first action taken in the episode (action picked after observing
84
+ obs(0)).
85
+ The last index in an episode (e.g. f3 in the example above) contains the final
86
+ observation of the episode, the final reward received, a dummy action
87
+ (repeat the previous action), as well as either terminated=True or truncated=True.
88
+ """
89
+
90
+ __slots__ = (
91
+ "capacity",
92
+ "batch_size_B",
93
+ "batch_length_T",
94
+ "episodes",
95
+ "episode_id_to_index",
96
+ "num_episodes_evicted",
97
+ "_indices",
98
+ "_num_timesteps",
99
+ "_num_timesteps_added",
100
+ "sampled_timesteps",
101
+ "rng",
102
+ )
103
+
104
+ def __init__(
105
+ self,
106
+ capacity: int = 10000,
107
+ *,
108
+ batch_size_B: int = 16,
109
+ batch_length_T: int = 64,
110
+ metrics_num_episodes_for_smoothing: int = 100,
111
+ ):
112
+ """Initializes an EpisodeReplayBuffer instance.
113
+
114
+ Args:
115
+ capacity: The total number of timesteps to be storable in this buffer.
116
+ Will start ejecting old episodes once this limit is reached.
117
+ batch_size_B: The number of rows in a SampleBatch returned from `sample()`.
118
+ batch_length_T: The length of each row in a SampleBatch returned from
119
+ `sample()`.
120
+ """
121
+ self.capacity = capacity
122
+ self.batch_size_B = batch_size_B
123
+ self.batch_length_T = batch_length_T
124
+
125
+ # The actual episode buffer. We are using a deque here for faster insertion
126
+ # (left side) and eviction (right side) of data.
127
+ self.episodes = deque()
128
+ # Maps (unique) episode IDs to the index under which to find this episode
129
+ # within our `self.episodes` deque.
130
+ # Note that even after eviction started, the indices in here will NOT be
131
+ # changed. We will therefore need to offset all indices in
132
+ # `self.episode_id_to_index` by the number of episodes that have already been
133
+ # evicted (self._num_episodes_evicted) in order to get the actual index to use
134
+ # on `self.episodes`.
135
+ self.episode_id_to_index = {}
136
+ # The number of episodes that have already been evicted from the buffer
137
+ # due to reaching capacity.
138
+ self._num_episodes_evicted = 0
139
+
140
+ # List storing all index tuples: (eps_idx, ts_in_eps_idx), where ...
141
+ # `eps_idx - self._num_episodes_evicted' is the index into self.episodes.
142
+ # `ts_in_eps_idx` is the timestep index within that episode
143
+ # (0 = 1st timestep, etc..).
144
+ # We sample uniformly from the set of these indices in a `sample()`
145
+ # call.
146
+ self._indices = []
147
+
148
+ # The size of the buffer in timesteps.
149
+ self._num_timesteps = 0
150
+ # The number of timesteps added thus far.
151
+ self._num_timesteps_added = 0
152
+
153
+ # How many timesteps have been sampled from the buffer in total?
154
+ self.sampled_timesteps = 0
155
+
156
+ self.rng = np.random.default_rng(seed=None)
157
+
158
+ # Initialize the metrics.
159
+ self.metrics = MetricsLogger()
160
+ self._metrics_num_episodes_for_smoothing = metrics_num_episodes_for_smoothing
161
+
162
+ @override(ReplayBufferInterface)
163
+ def __len__(self) -> int:
164
+ return self.get_num_timesteps()
165
+
166
+ @override(ReplayBufferInterface)
167
+ def add(self, episodes: Union[List["SingleAgentEpisode"], "SingleAgentEpisode"]):
168
+ """Converts incoming SampleBatch into a number of SingleAgentEpisode objects.
169
+
170
+ Then adds these episodes to the internal deque.
171
+ """
172
+ episodes = force_list(episodes)
173
+
174
+ # Set up some counters for metrics.
175
+ num_env_steps_added = 0
176
+ num_episodes_added = 0
177
+ num_episodes_evicted = 0
178
+ num_env_steps_evicted = 0
179
+
180
+ for eps in episodes:
181
+ # Make sure we don't change what's coming in from the user.
182
+ # TODO (sven): It'd probably be better to make sure in the EnvRunner to not
183
+ # hold on to episodes (for metrics purposes only) that we are returning
184
+ # back to the user from `EnvRunner.sample()`. Then we wouldn't have to
185
+ # do any copying. Instead, either compile the metrics right away on the
186
+ # EnvRunner OR compile metrics entirely on the Algorithm side (this is
187
+ # actually preferred).
188
+ eps = copy.deepcopy(eps)
189
+
190
+ eps_len = len(eps)
191
+ # TODO (simon): Check, if we can deprecate these two
192
+ # variables and instead peek into the metrics.
193
+ self._num_timesteps += eps_len
194
+ self._num_timesteps_added += eps_len
195
+ num_env_steps_added += eps_len
196
+
197
+ # Ongoing episode, concat to existing record.
198
+ if eps.id_ in self.episode_id_to_index:
199
+ eps_idx = self.episode_id_to_index[eps.id_]
200
+ existing_eps = self.episodes[eps_idx - self._num_episodes_evicted]
201
+ old_len = len(existing_eps)
202
+ self._indices.extend([(eps_idx, old_len + i) for i in range(len(eps))])
203
+ existing_eps.concat_episode(eps)
204
+ # New episode. Add to end of our episodes deque.
205
+ else:
206
+ num_episodes_added += 1
207
+ self.episodes.append(eps)
208
+ eps_idx = len(self.episodes) - 1 + self._num_episodes_evicted
209
+ self.episode_id_to_index[eps.id_] = eps_idx
210
+ self._indices.extend([(eps_idx, i) for i in range(len(eps))])
211
+
212
+ # Eject old records from front of deque (only if we have more than 1 episode
213
+ # in the buffer).
214
+ while self._num_timesteps > self.capacity and self.get_num_episodes() > 1:
215
+ # Eject oldest episode.
216
+ evicted_eps = self.episodes.popleft()
217
+ evicted_eps_len = len(evicted_eps)
218
+ num_episodes_evicted += 1
219
+ num_env_steps_evicted += evicted_eps_len
220
+ # Correct our size.
221
+ self._num_timesteps -= evicted_eps_len
222
+
223
+ # Erase episode from all our indices:
224
+ # 1) Main episode index.
225
+ evicted_idx = self.episode_id_to_index[evicted_eps.id_]
226
+ del self.episode_id_to_index[evicted_eps.id_]
227
+ # 2) All timestep indices that this episode owned.
228
+ new_indices = [] # New indices that will replace self._indices.
229
+ idx_cursor = 0
230
+ # Loop through all (eps_idx, ts_in_eps_idx)-tuples.
231
+ for i, idx_tuple in enumerate(self._indices):
232
+ # This tuple is part of the evicted episode -> Add everything
233
+ # up until here to `new_indices` (excluding this very index, b/c
234
+ # it's already part of the evicted episode).
235
+ if idx_cursor is not None and idx_tuple[0] == evicted_idx:
236
+ new_indices.extend(self._indices[idx_cursor:i])
237
+ # Set to None to indicate we are in the eviction zone.
238
+ idx_cursor = None
239
+ # We are/have been in the eviction zone (i pointing/pointed to the
240
+ # evicted episode) ..
241
+ elif idx_cursor is None:
242
+ # ... but are now not anymore (i is now an index into a
243
+ # non-evicted episode) -> Set cursor to valid int again.
244
+ if idx_tuple[0] != evicted_idx:
245
+ idx_cursor = i
246
+ # But early-out if evicted episode was only 1 single
247
+ # timestep long.
248
+ if evicted_eps_len == 1:
249
+ break
250
+ # Early-out: We reached the end of the to-be-evicted episode.
251
+ # We can stop searching further here (all following tuples
252
+ # will NOT be in the evicted episode).
253
+ elif idx_tuple[1] == evicted_eps_len - 1:
254
+ assert self._indices[i + 1][0] != idx_tuple[0]
255
+ idx_cursor = i + 1
256
+ break
257
+
258
+ # Jump over (splice-out) the evicted episode if we are still in the
259
+ # eviction zone.
260
+ if idx_cursor is not None:
261
+ new_indices.extend(self._indices[idx_cursor:])
262
+
263
+ # Reset our `self._indices` to the newly compiled list.
264
+ self._indices = new_indices
265
+
266
+ # Increase episode evicted counter.
267
+ self._num_episodes_evicted += 1
268
+
269
+ self._update_add_metrics(
270
+ num_env_steps_added,
271
+ num_episodes_added,
272
+ num_episodes_evicted,
273
+ num_env_steps_evicted,
274
+ )
275
+
276
+ @OverrideToImplementCustomLogic_CallToSuperRecommended
277
+ def _update_add_metrics(
278
+ self,
279
+ num_timesteps_added: int,
280
+ num_episodes_added: int,
281
+ num_episodes_evicted: int,
282
+ num_env_steps_evicted: int,
283
+ **kwargs,
284
+ ) -> None:
285
+ """Updates the replay buffer's adding metrics.
286
+
287
+ Args:
288
+ num_timesteps_added: The total number of environment steps added to the
289
+ buffer in the `EpisodeReplayBuffer.add` call.
290
+ num_episodes_added: The total number of episodes added to the
291
+ buffer in the `EpisodeReplayBuffer.add` call.
292
+ num_episodes_evicted: The total number of environment steps evicted from
293
+ the buffer in the `EpisodeReplayBuffer.add` call. Note, this
294
+ does not include the number of episodes evicted before ever
295
+ added to the buffer (i.e. can happen in case a lot of episodes
296
+ were added and the buffer's capacity is not large enough).
297
+ num_env_steps_evicted: he total number of environment steps evicted from
298
+ the buffer in the `EpisodeReplayBuffer.add` call. Note, this
299
+ does not include the number of steps evicted before ever
300
+ added to the buffer (i.e. can happen in case a lot of episodes
301
+ were added and the buffer's capacity is not large enough).
302
+ """
303
+ # Get the actual number of agent steps residing in the buffer.
304
+ # TODO (simon): Write the same counters and getters as for the
305
+ # multi-agent buffers.
306
+ self.metrics.log_value(
307
+ (NUM_AGENT_STEPS_STORED, DEFAULT_AGENT_ID),
308
+ self.get_num_timesteps(),
309
+ reduce="mean",
310
+ window=self._metrics_num_episodes_for_smoothing,
311
+ )
312
+ # Number of timesteps added.
313
+ self.metrics.log_value(
314
+ (NUM_AGENT_STEPS_ADDED, DEFAULT_AGENT_ID),
315
+ num_timesteps_added,
316
+ reduce="sum",
317
+ clear_on_reduce=True,
318
+ )
319
+ self.metrics.log_value(
320
+ (NUM_AGENT_STEPS_ADDED_LIFETIME, DEFAULT_AGENT_ID),
321
+ num_timesteps_added,
322
+ reduce="sum",
323
+ )
324
+ self.metrics.log_value(
325
+ (NUM_AGENT_STEPS_EVICTED, DEFAULT_AGENT_ID),
326
+ num_env_steps_evicted,
327
+ reduce="sum",
328
+ clear_on_reduce=True,
329
+ )
330
+ self.metrics.log_value(
331
+ (NUM_AGENT_STEPS_EVICTED_LIFETIME, DEFAULT_AGENT_ID),
332
+ num_env_steps_evicted,
333
+ reduce="sum",
334
+ )
335
+ # Whole buffer step metrics.
336
+ self.metrics.log_value(
337
+ NUM_ENV_STEPS_STORED,
338
+ self.get_num_timesteps(),
339
+ reduce="mean",
340
+ window=self._metrics_num_episodes_for_smoothing,
341
+ )
342
+ self.metrics.log_value(
343
+ NUM_ENV_STEPS_ADDED,
344
+ num_timesteps_added,
345
+ reduce="sum",
346
+ clear_on_reduce=True,
347
+ )
348
+ self.metrics.log_value(
349
+ NUM_ENV_STEPS_ADDED_LIFETIME,
350
+ num_timesteps_added,
351
+ reduce="sum",
352
+ )
353
+ self.metrics.log_value(
354
+ NUM_ENV_STEPS_EVICTED,
355
+ num_env_steps_evicted,
356
+ reduce="sum",
357
+ clear_on_reduce=True,
358
+ )
359
+ self.metrics.log_value(
360
+ NUM_ENV_STEPS_EVICTED_LIFETIME,
361
+ num_env_steps_evicted,
362
+ reduce="sum",
363
+ )
364
+
365
+ # Episode metrics.
366
+
367
+ # Number of episodes in the buffer.
368
+ self.metrics.log_value(
369
+ (NUM_AGENT_EPISODES_STORED, DEFAULT_AGENT_ID),
370
+ self.get_num_episodes(),
371
+ reduce="mean",
372
+ window=self._metrics_num_episodes_for_smoothing,
373
+ )
374
+ # Number of new episodes added. Note, this metric could
375
+ # be zero.
376
+ self.metrics.log_value(
377
+ (NUM_AGENT_EPISODES_ADDED, DEFAULT_AGENT_ID),
378
+ num_episodes_added,
379
+ reduce="sum",
380
+ clear_on_reduce=True,
381
+ )
382
+ self.metrics.log_value(
383
+ (NUM_AGENT_EPISODES_ADDED_LIFETIME, DEFAULT_AGENT_ID),
384
+ num_episodes_added,
385
+ reduce="sum",
386
+ )
387
+ self.metrics.log_value(
388
+ (NUM_AGENT_EPISODES_EVICTED, DEFAULT_AGENT_ID),
389
+ num_episodes_evicted,
390
+ reduce="sum",
391
+ clear_on_reduce=True,
392
+ )
393
+ self.metrics.log_value(
394
+ (NUM_AGENT_EPISODES_EVICTED_LIFETIME, DEFAULT_AGENT_ID),
395
+ num_episodes_evicted,
396
+ reduce="sum",
397
+ )
398
+
399
+ # Whole buffer episode metrics.
400
+ self.metrics.log_value(
401
+ NUM_EPISODES_STORED,
402
+ self.get_num_episodes(),
403
+ reduce="mean",
404
+ window=self._metrics_num_episodes_for_smoothing,
405
+ )
406
+ # Number of new episodes added. Note, this metric could
407
+ # be zero.
408
+ self.metrics.log_value(
409
+ NUM_EPISODES_ADDED,
410
+ num_episodes_added,
411
+ reduce="sum",
412
+ clear_on_reduce=True,
413
+ )
414
+ self.metrics.log_value(
415
+ NUM_EPISODES_ADDED_LIFETIME,
416
+ num_episodes_added,
417
+ reduce="sum",
418
+ )
419
+ self.metrics.log_value(
420
+ NUM_EPISODES_EVICTED,
421
+ num_episodes_evicted,
422
+ reduce="sum",
423
+ clear_on_reduce=True,
424
+ )
425
+ self.metrics.log_value(
426
+ NUM_EPISODES_EVICTED_LIFETIME,
427
+ num_episodes_evicted,
428
+ reduce="sum",
429
+ )
430
+
431
+ @override(ReplayBufferInterface)
432
+ def sample(
433
+ self,
434
+ num_items: Optional[int] = None,
435
+ *,
436
+ batch_size_B: Optional[int] = None,
437
+ batch_length_T: Optional[int] = None,
438
+ n_step: Optional[Union[int, Tuple]] = None,
439
+ beta: float = 0.0,
440
+ gamma: float = 0.99,
441
+ include_infos: bool = False,
442
+ include_extra_model_outputs: bool = False,
443
+ sample_episodes: Optional[bool] = False,
444
+ to_numpy: bool = False,
445
+ # TODO (simon): Check, if we need here 1 as default.
446
+ lookback: int = 0,
447
+ min_batch_length_T: int = 0,
448
+ **kwargs,
449
+ ) -> Union[SampleBatchType, SingleAgentEpisode]:
450
+ """Samples from a buffer in a randomized way.
451
+
452
+ Each sampled item defines a transition of the form:
453
+
454
+ `(o_t, a_t, sum(r_(t+1:t+n+1)), o_(t+n), terminated_(t+n), truncated_(t+n))`
455
+
456
+ where `o_t` is drawn by randomized sampling.`n` is defined by the `n_step`
457
+ applied.
458
+
459
+ If requested, `info`s of a transitions last timestep `t+n` and respective
460
+ extra model outputs (e.g. action log-probabilities) are added to
461
+ the batch.
462
+
463
+ Args:
464
+ num_items: Number of items (transitions) to sample from this
465
+ buffer.
466
+ batch_size_B: The number of rows (transitions) to return in the
467
+ batch
468
+ batch_length_T: THe sequence length to sample. At this point in time
469
+ only sequences of length 1 are possible.
470
+ n_step: The n-step to apply. For the default the batch contains in
471
+ `"new_obs"` the observation and in `"obs"` the observation `n`
472
+ time steps before. The reward will be the sum of rewards
473
+ collected in between these two observations and the action will
474
+ be the one executed n steps before such that we always have the
475
+ state-action pair that triggered the rewards.
476
+ If `n_step` is a tuple, it is considered as a range to sample
477
+ from. If `None`, we use `n_step=1`.
478
+ gamma: The discount factor to be used when applying n-step calculations.
479
+ The default of `0.99` should be replaced by the `Algorithm`s
480
+ discount factor.
481
+ include_infos: A boolean indicating, if `info`s should be included in
482
+ the batch. This could be of advantage, if the `info` contains
483
+ values from the environment important for loss computation. If
484
+ `True`, the info at the `"new_obs"` in the batch is included.
485
+ include_extra_model_outputs: A boolean indicating, if
486
+ `extra_model_outputs` should be included in the batch. This could be
487
+ of advantage, if the `extra_mdoel_outputs` contain outputs from the
488
+ model important for loss computation and only able to compute with the
489
+ actual state of model e.g. action log-probabilities, etc.). If `True`,
490
+ the extra model outputs at the `"obs"` in the batch is included (the
491
+ timestep at which the action is computed).
492
+ to_numpy: If episodes should be numpy'ized.
493
+ lookback: A desired lookback. Any non-negative integer is valid.
494
+ min_batch_length_T: An optional minimal length when sampling sequences. It
495
+ ensures that sampled sequences are at least `min_batch_length_T` time
496
+ steps long. This can be used to prevent empty sequences during
497
+ learning, when using a burn-in period for stateful `RLModule`s. In rare
498
+ cases, such as when episodes are very short early in training, this may
499
+ result in longer sampling times.
500
+
501
+ Returns:
502
+ Either a batch with transitions in each row or (if `return_episodes=True`)
503
+ a list of 1-step long episodes containing all basic episode data and if
504
+ requested infos and extra model outputs.
505
+ """
506
+
507
+ if sample_episodes:
508
+ return self._sample_episodes(
509
+ num_items=num_items,
510
+ batch_size_B=batch_size_B,
511
+ batch_length_T=batch_length_T,
512
+ n_step=n_step,
513
+ beta=beta,
514
+ gamma=gamma,
515
+ include_infos=include_infos,
516
+ include_extra_model_outputs=include_extra_model_outputs,
517
+ to_numpy=to_numpy,
518
+ lookback=lookback,
519
+ min_batch_length_T=min_batch_length_T,
520
+ )
521
+ else:
522
+ return self._sample_batch(
523
+ num_items=num_items,
524
+ batch_size_B=batch_size_B,
525
+ batch_length_T=batch_length_T,
526
+ )
527
+
528
+ def _sample_batch(
529
+ self,
530
+ num_items: Optional[int] = None,
531
+ *,
532
+ batch_size_B: Optional[int] = None,
533
+ batch_length_T: Optional[int] = None,
534
+ ) -> SampleBatchType:
535
+ """Returns a batch of size B (number of "rows"), where each row has length T.
536
+
537
+ Each row contains consecutive timesteps from an episode, but might not start
538
+ at the beginning of that episode. Should an episode end within such a
539
+ row (trajectory), a random next episode (starting from its t0) will be
540
+ concatenated to that row. For more details, see the docstring of the
541
+ EpisodeReplayBuffer class.
542
+
543
+ Args:
544
+ num_items: See `batch_size_B`. For compatibility with the
545
+ `ReplayBufferInterface` abstract base class.
546
+ batch_size_B: The number of rows (trajectories) to return in the batch.
547
+ batch_length_T: The length of each row (in timesteps) to return in the
548
+ batch.
549
+
550
+ Returns:
551
+ The sampled batch (observations, actions, rewards, terminateds, truncateds)
552
+ of dimensions [B, T, ...].
553
+ """
554
+ if num_items is not None:
555
+ assert batch_size_B is None, (
556
+ "Cannot call `sample()` with both `num_items` and `batch_size_B` "
557
+ "provided! Use either one."
558
+ )
559
+ batch_size_B = num_items
560
+
561
+ # Use our default values if no sizes/lengths provided.
562
+ batch_size_B = batch_size_B or self.batch_size_B
563
+ batch_length_T = batch_length_T or self.batch_length_T
564
+
565
+ # Rows to return.
566
+ observations = [[] for _ in range(batch_size_B)]
567
+ actions = [[] for _ in range(batch_size_B)]
568
+ rewards = [[] for _ in range(batch_size_B)]
569
+ is_first = [[False] * batch_length_T for _ in range(batch_size_B)]
570
+ is_last = [[False] * batch_length_T for _ in range(batch_size_B)]
571
+ is_terminated = [[False] * batch_length_T for _ in range(batch_size_B)]
572
+ is_truncated = [[False] * batch_length_T for _ in range(batch_size_B)]
573
+
574
+ # Record all the env step buffer indices that are contained in the sample.
575
+ sampled_env_step_idxs = set()
576
+ # Record all the episode buffer indices that are contained in the sample.
577
+ sampled_episode_idxs = set()
578
+
579
+ B = 0
580
+ T = 0
581
+ while B < batch_size_B:
582
+ # Pull a new uniform random index tuple: (eps_idx, ts_in_eps_idx).
583
+ index_tuple = self._indices[self.rng.integers(len(self._indices))]
584
+
585
+ # Compute the actual episode index (offset by the number of
586
+ # already evicted episodes).
587
+ episode_idx, episode_ts = (
588
+ index_tuple[0] - self._num_episodes_evicted,
589
+ index_tuple[1],
590
+ )
591
+ episode = self.episodes[episode_idx]
592
+
593
+ # Starting a new chunk, set is_first to True.
594
+ is_first[B][T] = True
595
+
596
+ # Begin of new batch item (row).
597
+ if len(rewards[B]) == 0:
598
+ # And we are at the start of an episode: Set reward to 0.0.
599
+ if episode_ts == 0:
600
+ rewards[B].append(0.0)
601
+ # We are in the middle of an episode: Set reward to the previous
602
+ # timestep's values.
603
+ else:
604
+ rewards[B].append(episode.rewards[episode_ts - 1])
605
+ # We are in the middle of a batch item (row). Concat next episode to this
606
+ # row from the next episode's beginning. In other words, we never concat
607
+ # a middle of an episode to another truncated one.
608
+ else:
609
+ episode_ts = 0
610
+ rewards[B].append(0.0)
611
+
612
+ observations[B].extend(episode.observations[episode_ts:])
613
+ # Repeat last action to have the same number of actions than observations.
614
+ actions[B].extend(episode.actions[episode_ts:])
615
+ actions[B].append(episode.actions[-1])
616
+ # Number of rewards are also the same as observations b/c we have the
617
+ # initial 0.0 one.
618
+ rewards[B].extend(episode.rewards[episode_ts:])
619
+ assert len(observations[B]) == len(actions[B]) == len(rewards[B])
620
+
621
+ T = min(len(observations[B]), batch_length_T)
622
+
623
+ # Set is_last=True.
624
+ is_last[B][T - 1] = True
625
+ # If episode is terminated and we have reached the end of it, set
626
+ # is_terminated=True.
627
+ if episode.is_terminated and T == len(observations[B]):
628
+ is_terminated[B][T - 1] = True
629
+ # If episode is truncated and we have reached the end of it, set
630
+ # is_truncated=True.
631
+ elif episode.is_truncated and T == len(observations[B]):
632
+ is_truncated[B][T - 1] = True
633
+
634
+ # We are done with this batch row.
635
+ if T == batch_length_T:
636
+ # We may have overfilled this row: Clip trajectory at the end.
637
+ observations[B] = observations[B][:batch_length_T]
638
+ actions[B] = actions[B][:batch_length_T]
639
+ rewards[B] = rewards[B][:batch_length_T]
640
+ # Start filling the next row.
641
+ B += 1
642
+ T = 0
643
+ # Add the episode buffer index to the set of episode indexes.
644
+ sampled_episode_idxs.add(episode_idx)
645
+ # Record a has for the episode ID and timestep inside of the episode.
646
+ sampled_env_step_idxs.add(
647
+ hashlib.sha256(f"{episode.id_}-{episode_ts}".encode()).hexdigest()
648
+ )
649
+
650
+ # Update our sampled counter.
651
+ self.sampled_timesteps += batch_size_B * batch_length_T
652
+
653
+ # Update the sample metrics.
654
+ self._update_sample_metrics(
655
+ num_env_steps_sampled=batch_size_B * batch_length_T,
656
+ num_episodes_per_sample=len(sampled_episode_idxs),
657
+ num_env_steps_per_sample=len(sampled_env_step_idxs),
658
+ sampled_n_step=None,
659
+ )
660
+
661
+ # TODO: Return SampleBatch instead of this simpler dict.
662
+ ret = {
663
+ "obs": np.array(observations),
664
+ "actions": np.array(actions),
665
+ "rewards": np.array(rewards),
666
+ "is_first": np.array(is_first),
667
+ "is_last": np.array(is_last),
668
+ "is_terminated": np.array(is_terminated),
669
+ "is_truncated": np.array(is_truncated),
670
+ }
671
+
672
+ return ret
673
+
674
+ def _sample_episodes(
675
+ self,
676
+ num_items: Optional[int] = None,
677
+ *,
678
+ batch_size_B: Optional[int] = None,
679
+ batch_length_T: Optional[int] = None,
680
+ n_step: Optional[Union[int, Tuple]] = None,
681
+ gamma: float = 0.99,
682
+ include_infos: bool = False,
683
+ include_extra_model_outputs: bool = False,
684
+ to_numpy: bool = False,
685
+ lookback: int = 1,
686
+ min_batch_length_T: int = 0,
687
+ **kwargs,
688
+ ) -> List[SingleAgentEpisode]:
689
+ """Samples episodes from a buffer in a randomized way.
690
+
691
+ Each sampled item defines a transition of the form:
692
+
693
+ `(o_t, a_t, sum(r_(t+1:t+n+1)), o_(t+n), terminated_(t+n), truncated_(t+n))`
694
+
695
+ where `o_t` is drawn by randomized sampling.`n` is defined by the `n_step`
696
+ applied.
697
+
698
+ If requested, `info`s of a transitions last timestep `t+n` and respective
699
+ extra model outputs (e.g. action log-probabilities) are added to
700
+ the batch.
701
+
702
+ Args:
703
+ num_items: Number of items (transitions) to sample from this
704
+ buffer.
705
+ batch_size_B: The number of rows (transitions) to return in the
706
+ batch
707
+ batch_length_T: The sequence length to sample. Can be either `None`
708
+ (the default) or any positive integer.
709
+ n_step: The n-step to apply. For the default the batch contains in
710
+ `"new_obs"` the observation and in `"obs"` the observation `n`
711
+ time steps before. The reward will be the sum of rewards
712
+ collected in between these two observations and the action will
713
+ be the one executed n steps before such that we always have the
714
+ state-action pair that triggered the rewards.
715
+ If `n_step` is a tuple, it is considered as a range to sample
716
+ from. If `None`, we use `n_step=1`.
717
+ gamma: The discount factor to be used when applying n-step calculations.
718
+ The default of `0.99` should be replaced by the `Algorithm`s
719
+ discount factor.
720
+ include_infos: A boolean indicating, if `info`s should be included in
721
+ the batch. This could be of advantage, if the `info` contains
722
+ values from the environment important for loss computation. If
723
+ `True`, the info at the `"new_obs"` in the batch is included.
724
+ include_extra_model_outputs: A boolean indicating, if
725
+ `extra_model_outputs` should be included in the batch. This could be
726
+ of advantage, if the `extra_mdoel_outputs` contain outputs from the
727
+ model important for loss computation and only able to compute with the
728
+ actual state of model e.g. action log-probabilities, etc.). If `True`,
729
+ the extra model outputs at the `"obs"` in the batch is included (the
730
+ timestep at which the action is computed).
731
+ to_numpy: If episodes should be numpy'ized.
732
+ lookback: A desired lookback. Any non-negative integer is valid.
733
+ min_batch_length_T: An optional minimal length when sampling sequences. It
734
+ ensures that sampled sequences are at least `min_batch_length_T` time
735
+ steps long. This can be used to prevent empty sequences during
736
+ learning, when using a burn-in period for stateful `RLModule`s. In rare
737
+ cases, such as when episodes are very short early in training, this may
738
+ result in longer sampling times.
739
+
740
+ Returns:
741
+ A list of 1-step long episodes containing all basic episode data and if
742
+ requested infos and extra model outputs.
743
+ """
744
+ if num_items is not None:
745
+ assert batch_size_B is None, (
746
+ "Cannot call `sample()` with both `num_items` and `batch_size_B` "
747
+ "provided! Use either one."
748
+ )
749
+ batch_size_B = num_items
750
+
751
+ # Use our default values if no sizes/lengths provided.
752
+ batch_size_B = batch_size_B or self.batch_size_B
753
+
754
+ assert n_step is not None, (
755
+ "When sampling episodes, `n_step` must be "
756
+ "provided, but `n_step` is `None`."
757
+ )
758
+ # If no sequence should be sampled, we sample n-steps.
759
+ if not batch_length_T:
760
+ # Sample the `n_step`` itself, if necessary.
761
+ actual_n_step = n_step
762
+ random_n_step = isinstance(n_step, tuple)
763
+ # Otherwise we use an n-step of 1.
764
+ else:
765
+ assert (
766
+ not isinstance(n_step, tuple) and n_step == 1
767
+ ), "When sampling sequences n-step must be 1."
768
+ actual_n_step = n_step
769
+
770
+ # Keep track of the indices that were sampled last for updating the
771
+ # weights later (see `ray.rllib.utils.replay_buffer.utils.
772
+ # update_priorities_in_episode_replay_buffer`).
773
+ self._last_sampled_indices = []
774
+
775
+ sampled_episodes = []
776
+ # Record all the env step buffer indices that are contained in the sample.
777
+ sampled_env_step_idxs = set()
778
+ # Record all the episode buffer indices that are contained in the sample.
779
+ sampled_episode_idxs = set()
780
+ # Record all n-steps that have been used.
781
+ sampled_n_steps = []
782
+
783
+ B = 0
784
+ while B < batch_size_B:
785
+ # Pull a new uniform random index tuple: (eps_idx, ts_in_eps_idx).
786
+ index_tuple = self._indices[self.rng.integers(len(self._indices))]
787
+
788
+ # Compute the actual episode index (offset by the number of
789
+ # already evicted episodes).
790
+ episode_idx, episode_ts = (
791
+ index_tuple[0] - self._num_episodes_evicted,
792
+ index_tuple[1],
793
+ )
794
+ episode = self.episodes[episode_idx]
795
+
796
+ # If we use random n-step sampling, draw the n-step for this item.
797
+ if not batch_length_T and random_n_step:
798
+ actual_n_step = int(self.rng.integers(n_step[0], n_step[1]))
799
+
800
+ # Skip, if we are too far to the end and `episode_ts` + n_step would go
801
+ # beyond the episode's end.
802
+ if min_batch_length_T > 0 and episode_ts + min_batch_length_T >= len(
803
+ episode
804
+ ):
805
+ continue
806
+ if episode_ts + (batch_length_T or 0) + (actual_n_step - 1) > len(episode):
807
+ actual_length = len(episode)
808
+ else:
809
+ actual_length = episode_ts + (batch_length_T or 0) + (actual_n_step - 1)
810
+
811
+ # If no sequence should be sampled, we sample here the n-step.
812
+ if not batch_length_T:
813
+ sampled_episode = episode.slice(
814
+ slice(
815
+ episode_ts,
816
+ episode_ts + actual_n_step,
817
+ )
818
+ )
819
+ # Note, this will be the reward after executing action
820
+ # `a_(episode_ts-n_step+1)`. For `n_step>1` this will be the discounted
821
+ # sum of all discounted rewards that were collected over the last n
822
+ # steps.
823
+ raw_rewards = sampled_episode.get_rewards()
824
+
825
+ rewards = scipy.signal.lfilter(
826
+ [1], [1, -gamma], raw_rewards[::-1], axis=0
827
+ )[-1]
828
+
829
+ sampled_episode = SingleAgentEpisode(
830
+ id_=sampled_episode.id_,
831
+ agent_id=sampled_episode.agent_id,
832
+ module_id=sampled_episode.module_id,
833
+ observation_space=sampled_episode.observation_space,
834
+ action_space=sampled_episode.action_space,
835
+ observations=[
836
+ sampled_episode.get_observations(0),
837
+ sampled_episode.get_observations(-1),
838
+ ],
839
+ actions=[sampled_episode.get_actions(0)],
840
+ rewards=[rewards],
841
+ infos=[
842
+ sampled_episode.get_infos(0),
843
+ sampled_episode.get_infos(-1),
844
+ ],
845
+ terminated=sampled_episode.is_terminated,
846
+ truncated=sampled_episode.is_truncated,
847
+ extra_model_outputs={
848
+ **(
849
+ {
850
+ k: [episode.get_extra_model_outputs(k, 0)]
851
+ for k in episode.extra_model_outputs.keys()
852
+ }
853
+ if include_extra_model_outputs
854
+ else {}
855
+ ),
856
+ },
857
+ t_started=episode_ts,
858
+ len_lookback_buffer=0,
859
+ )
860
+ # Otherwise we simply slice the episode.
861
+ else:
862
+ sampled_episode = episode.slice(
863
+ slice(
864
+ episode_ts,
865
+ actual_length,
866
+ ),
867
+ len_lookback_buffer=lookback,
868
+ )
869
+ # Record a has for the episode ID and timestep inside of the episode.
870
+ sampled_env_step_idxs.add(
871
+ hashlib.sha256(f"{episode.id_}-{episode_ts}".encode()).hexdigest()
872
+ )
873
+ # Remove reference to sampled episode.
874
+ del episode
875
+
876
+ # Add the actually chosen n-step in this episode.
877
+ sampled_episode.extra_model_outputs["n_step"] = InfiniteLookbackBuffer(
878
+ np.full((len(sampled_episode) + lookback,), actual_n_step),
879
+ lookback=lookback,
880
+ )
881
+ # Some loss functions need `weights` - which are only relevant when
882
+ # prioritizing.
883
+ sampled_episode.extra_model_outputs["weights"] = InfiniteLookbackBuffer(
884
+ np.ones((len(sampled_episode) + lookback,)), lookback=lookback
885
+ )
886
+
887
+ # Append the sampled episode.
888
+ sampled_episodes.append(sampled_episode)
889
+ sampled_episode_idxs.add(episode_idx)
890
+ sampled_n_steps.append(actual_n_step)
891
+
892
+ # Increment counter.
893
+ B += (actual_length - episode_ts - (actual_n_step - 1) + 1) or 1
894
+
895
+ # Update the metric.
896
+ self.sampled_timesteps += batch_size_B
897
+
898
+ # Update the sample metrics.
899
+ self._update_sample_metrics(
900
+ batch_size_B,
901
+ len(sampled_episode_idxs),
902
+ len(sampled_env_step_idxs),
903
+ sum(sampled_n_steps) / batch_size_B,
904
+ )
905
+
906
+ return sampled_episodes
907
+
908
+ @OverrideToImplementCustomLogic_CallToSuperRecommended
909
+ def _update_sample_metrics(
910
+ self,
911
+ num_env_steps_sampled: int,
912
+ num_episodes_per_sample: int,
913
+ num_env_steps_per_sample: int,
914
+ sampled_n_step: Optional[float],
915
+ **kwargs: Dict[str, Any],
916
+ ) -> None:
917
+ """Updates the replay buffer's sample metrics.
918
+
919
+ Args:
920
+ num_env_steps_sampled: The number of environment steps sampled
921
+ this iteration in the `sample` method.
922
+ num_episodes_per_sample: The number of unique episodes in the
923
+ sample.
924
+ num_env_steps_per_sample: The number of unique environment steps
925
+ in the sample.
926
+ sampled_n_step: The mean n-step used in the sample. Note, this
927
+ is constant, if the n-step is not sampled.
928
+ """
929
+ if sampled_n_step:
930
+ self.metrics.log_value(
931
+ ACTUAL_N_STEP,
932
+ sampled_n_step,
933
+ reduce="mean",
934
+ window=self._metrics_num_episodes_for_smoothing,
935
+ )
936
+ self.metrics.log_value(
937
+ (AGENT_ACTUAL_N_STEP, DEFAULT_AGENT_ID),
938
+ sampled_n_step,
939
+ reduce="mean",
940
+ window=self._metrics_num_episodes_for_smoothing,
941
+ )
942
+ self.metrics.log_value(
943
+ (NUM_AGENT_EPISODES_PER_SAMPLE, DEFAULT_AGENT_ID),
944
+ num_episodes_per_sample,
945
+ reduce="sum",
946
+ clear_on_reduce=True,
947
+ )
948
+ self.metrics.log_value(
949
+ (NUM_AGENT_STEPS_PER_SAMPLE, DEFAULT_AGENT_ID),
950
+ num_env_steps_per_sample,
951
+ reduce="sum",
952
+ clear_on_reduce=True,
953
+ )
954
+ self.metrics.log_value(
955
+ (NUM_AGENT_STEPS_PER_SAMPLE_LIFETIME, DEFAULT_AGENT_ID),
956
+ num_env_steps_per_sample,
957
+ reduce="sum",
958
+ )
959
+ self.metrics.log_value(
960
+ (NUM_AGENT_STEPS_SAMPLED, DEFAULT_AGENT_ID),
961
+ num_env_steps_sampled,
962
+ reduce="sum",
963
+ clear_on_reduce=True,
964
+ )
965
+ # TODO (simon): Check, if we can then deprecate
966
+ # self.sampled_timesteps.
967
+ self.metrics.log_value(
968
+ (NUM_AGENT_STEPS_SAMPLED_LIFETIME, DEFAULT_AGENT_ID),
969
+ num_env_steps_sampled,
970
+ reduce="sum",
971
+ )
972
+ self.metrics.log_value(
973
+ (AGENT_STEP_UTILIZATION, DEFAULT_AGENT_ID),
974
+ self.metrics.peek((NUM_AGENT_STEPS_PER_SAMPLE_LIFETIME, DEFAULT_AGENT_ID))
975
+ / self.metrics.peek((NUM_AGENT_STEPS_SAMPLED_LIFETIME, DEFAULT_AGENT_ID)),
976
+ reduce="mean",
977
+ window=self._metrics_num_episodes_for_smoothing,
978
+ )
979
+ # Whole buffer sampled env steps metrics.
980
+ self.metrics.log_value(
981
+ NUM_EPISODES_PER_SAMPLE,
982
+ num_episodes_per_sample,
983
+ reduce="sum",
984
+ clear_on_reduce=True,
985
+ )
986
+ self.metrics.log_value(
987
+ NUM_ENV_STEPS_PER_SAMPLE,
988
+ num_env_steps_per_sample,
989
+ reduce="sum",
990
+ clear_on_reduce=True,
991
+ )
992
+ self.metrics.log_value(
993
+ NUM_ENV_STEPS_PER_SAMPLE_LIFETIME,
994
+ num_env_steps_per_sample,
995
+ reduce="sum",
996
+ )
997
+ self.metrics.log_value(
998
+ NUM_ENV_STEPS_SAMPLED,
999
+ num_env_steps_sampled,
1000
+ reduce="sum",
1001
+ clear_on_reduce=True,
1002
+ )
1003
+ self.metrics.log_value(
1004
+ NUM_ENV_STEPS_SAMPLED_LIFETIME,
1005
+ num_env_steps_sampled,
1006
+ reduce="sum",
1007
+ )
1008
+ self.metrics.log_value(
1009
+ ENV_STEP_UTILIZATION,
1010
+ self.metrics.peek(NUM_ENV_STEPS_PER_SAMPLE_LIFETIME)
1011
+ / self.metrics.peek(NUM_ENV_STEPS_SAMPLED_LIFETIME),
1012
+ reduce="mean",
1013
+ window=self._metrics_num_episodes_for_smoothing,
1014
+ )
1015
+
1016
+ # TODO (simon): Check, if we can instead peek into the metrics
1017
+ # and deprecate all variables.
1018
+ def get_num_episodes(self) -> int:
1019
+ """Returns number of episodes (completed or truncated) stored in the buffer."""
1020
+ return len(self.episodes)
1021
+
1022
+ def get_num_episodes_evicted(self) -> int:
1023
+ """Returns number of episodes that have been evicted from the buffer."""
1024
+ return self._num_episodes_evicted
1025
+
1026
+ def get_num_timesteps(self) -> int:
1027
+ """Returns number of individual timesteps stored in the buffer."""
1028
+ return len(self._indices)
1029
+
1030
+ def get_sampled_timesteps(self) -> int:
1031
+ """Returns number of timesteps that have been sampled in buffer's lifetime."""
1032
+ return self.sampled_timesteps
1033
+
1034
+ def get_added_timesteps(self) -> int:
1035
+ """Returns number of timesteps that have been added in buffer's lifetime."""
1036
+ return self._num_timesteps_added
1037
+
1038
+ def get_metrics(self) -> ResultDict:
1039
+ """Returns the metrics of the buffer and reduces them."""
1040
+ return self.metrics.reduce()
1041
+
1042
+ @override(ReplayBufferInterface)
1043
+ def get_state(self) -> Dict[str, Any]:
1044
+ """Gets a pickable state of the buffer.
1045
+
1046
+ This is used for checkpointing the buffer's state. It is specifically helpful,
1047
+ for example, when a trial is paused and resumed later on. The buffer's state
1048
+ can be saved to disk and reloaded when the trial is resumed.
1049
+
1050
+ Returns:
1051
+ A dict containing all necessary information to restore the buffer's state.
1052
+ """
1053
+ return {
1054
+ "episodes": [eps.get_state() for eps in self.episodes],
1055
+ "episode_id_to_index": list(self.episode_id_to_index.items()),
1056
+ "_num_episodes_evicted": self._num_episodes_evicted,
1057
+ "_indices": self._indices,
1058
+ "_num_timesteps": self._num_timesteps,
1059
+ "_num_timesteps_added": self._num_timesteps_added,
1060
+ "sampled_timesteps": self.sampled_timesteps,
1061
+ }
1062
+
1063
+ @override(ReplayBufferInterface)
1064
+ def set_state(self, state) -> None:
1065
+ """Sets the state of a buffer from a previously stored state.
1066
+
1067
+ See `get_state()` for more information on what is stored in the state. This
1068
+ method is used to restore the buffer's state from a previously stored state.
1069
+ It is specifically helpful, for example, when a trial is paused and resumed
1070
+ later on. The buffer's state can be saved to disk and reloaded when the trial
1071
+ is resumed.
1072
+
1073
+ Args:
1074
+ state: The state to restore the buffer from.
1075
+ """
1076
+ self._set_episodes(state)
1077
+ self.episode_id_to_index = dict(state["episode_id_to_index"])
1078
+ self._num_episodes_evicted = state["_num_episodes_evicted"]
1079
+ self._indices = state["_indices"]
1080
+ self._num_timesteps = state["_num_timesteps"]
1081
+ self._num_timesteps_added = state["_num_timesteps_added"]
1082
+ self.sampled_timesteps = state["sampled_timesteps"]
1083
+
1084
+ def _set_episodes(self, state) -> None:
1085
+ """Sets the episodes from the state.
1086
+
1087
+ Note, this method is used for class inheritance purposes. It is specifically
1088
+ helpful when a subclass of this class wants to override the behavior of how
1089
+ episodes are set from the state. By default, it sets `SingleAgentEpuisode`s,
1090
+ but subclasses can override this method to set episodes of a different type.
1091
+ """
1092
+ if not self.episodes:
1093
+ self.episodes = deque(
1094
+ [
1095
+ SingleAgentEpisode.from_state(eps_data)
1096
+ for eps_data in state["episodes"]
1097
+ ]
1098
+ )
.venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/fifo_replay_buffer.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from typing import Any, Dict, Optional
3
+
4
+ from ray.rllib.policy.sample_batch import MultiAgentBatch
5
+ from ray.rllib.utils.annotations import override
6
+ from ray.rllib.utils.replay_buffers.replay_buffer import ReplayBuffer, StorageUnit
7
+ from ray.rllib.utils.typing import SampleBatchType
8
+ from ray.util.annotations import DeveloperAPI
9
+
10
+
11
@DeveloperAPI
class FifoReplayBuffer(ReplayBuffer):
    """A replay buffer implementing a simple FIFO queue.

    Sometimes, e.g. for offline use cases, it may be desirable to use
    off-policy algorithms without a replay buffer. This FifoReplayBuffer can
    be used in-place to achieve the same effect without having to introduce
    separate algorithm execution branches.

    For simplicity and efficiency reasons, this replay buffer stores incoming
    sample batches as-is, and returns them one at a time (oldest first).
    This is to avoid any additional load when this replay buffer is used.
    """

    def __init__(self, *args, **kwargs):
        """Initializes a FifoReplayBuffer.

        Args:
            *args: Forward compatibility args.
            **kwargs: Forward compatibility kwargs.
        """
        # Local import; the rest of this module does not need `collections`.
        from collections import deque

        # Completely by-passing underlying ReplayBuffer by setting its
        # capacity to 1 (lowest allowed capacity).
        ReplayBuffer.__init__(self, 1, StorageUnit.FRAGMENTS, **kwargs)

        # Use a deque so `sample()` can pop from the left in O(1)
        # (a plain list's `pop(0)` is O(n)).
        self._queue = deque()

    @DeveloperAPI
    @override(ReplayBuffer)
    def add(self, batch: SampleBatchType, **kwargs) -> None:
        """Appends the given batch to the right end of the queue."""
        # Do not `return` the result of `append()` - it is always None and
        # returning it only suggests a meaningful value where there is none.
        self._queue.append(batch)

    @DeveloperAPI
    @override(ReplayBuffer)
    def sample(self, *args, **kwargs) -> Optional[SampleBatchType]:
        """Samples (pops) the oldest saved training batch from this buffer.

        Args:
            *args: Forward compatibility args.
            **kwargs: Forward compatibility kwargs.

        Returns:
            The oldest training batch in the queue, or an empty
            `MultiAgentBatch` if the queue is empty.
        """
        if not self._queue:
            # Return empty SampleBatch if queue is empty.
            return MultiAgentBatch({}, 0)
        batch = self._queue.popleft()
        # Equal weights of 1.0 for each timestep in the batch.
        batch["weights"] = np.ones(len(batch))
        return batch

    @DeveloperAPI
    def update_priorities(self, *args, **kwargs) -> None:
        """Updates priorities of items at given indices.

        No-op for this replay buffer.

        Args:
            *args: Forward compatibility args.
            **kwargs: Forward compatibility kwargs.
        """
        pass

    @DeveloperAPI
    @override(ReplayBuffer)
    def stats(self, debug: bool = False) -> Dict:
        """Returns the stats of this buffer.

        Args:
            debug: If True, adds sample eviction statistics to the returned
                stats dict (unused by this buffer).

        Returns:
            An empty dict - as if this replay buffer has never existed.
        """
        return {}

    @DeveloperAPI
    @override(ReplayBuffer)
    def get_state(self) -> Dict[str, Any]:
        """Returns all local state.

        Returns:
            An empty dict; this pass-through replay buffer does not save state.
        """
        return {}

    @DeveloperAPI
    @override(ReplayBuffer)
    def set_state(self, state: Dict[str, Any]) -> None:
        """Restores all local state to the provided `state`.

        No-op for this (stateless) replay buffer.

        Args:
            state: The new state to set this buffer. Can be obtained by calling
                `self.get_state()`.
        """
        pass
.venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/multi_agent_episode_buffer.py ADDED
@@ -0,0 +1,1026 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ from collections import defaultdict, deque
3
+ from gymnasium.core import ActType, ObsType
4
+ import numpy as np
5
+ import scipy
6
+ from typing import Any, Dict, List, Optional, Set, Tuple, Union
7
+
8
+ from ray.rllib.core.columns import Columns
9
+ from ray.rllib.env.multi_agent_episode import MultiAgentEpisode
10
+ from ray.rllib.env.single_agent_episode import SingleAgentEpisode
11
+ from ray.rllib.utils.replay_buffers.episode_replay_buffer import EpisodeReplayBuffer
12
+ from ray.rllib.utils import force_list
13
+ from ray.rllib.utils.annotations import override, DeveloperAPI
14
+ from ray.rllib.utils.spaces.space_utils import batch
15
+ from ray.rllib.utils.typing import AgentID, ModuleID, SampleBatchType
16
+
17
+
18
+ @DeveloperAPI
19
+ class MultiAgentEpisodeReplayBuffer(EpisodeReplayBuffer):
20
+ """Multi-agent episode replay buffer that stores episodes by their IDs.
21
+
22
+ This class implements a replay buffer as used in "playing Atari with Deep
23
+ Reinforcement Learning" (Mnih et al., 2013) for multi-agent reinforcement
24
+ learning,
25
+
26
+ Each "row" (a slot in a deque) in the buffer is occupied by one episode. If an
27
+ incomplete episode is added to the buffer and then another chunk of that episode is
28
+ added at a later time, the buffer will automatically concatenate the new fragment to
29
+ the original episode. This way, episodes can be completed via subsequent `add`
30
+ calls.
31
+
32
+ Sampling returns a size `B` episode list (number of 'rows'), where each episode
33
+ holds a tuple tuple of the form
34
+
35
+ `(o_t, a_t, sum(r_t+1:t+n), o_t+n)`
36
+
37
+ where `o_t` is the observation in `t`, `a_t` the action chosen at observation `o_t`,
38
+ `o_t+n` is the observation `n` timesteps later and `sum(r_t+1:t+n)` is the sum of
39
+ all rewards collected over the time steps between `t+1` and `t+n`. The `n`-step can
40
+ be chosen freely when sampling and defaults to `1`. If `n_step` is a tuple it is
41
+ sampled uniformly across the interval defined by the tuple (for each row in the
42
+ batch).
43
+
44
+ Each episode contains - in addition to the data tuples presented above - two further
45
+ elements in its `extra_model_outputs`, namely `n_steps` and `weights`. The former
46
+ holds the `n_step` used for the sampled timesteps in the episode and the latter the
47
+ corresponding (importance sampling) weight for the transition.
48
+
49
+ .. testcode::
50
+
51
+ import gymnasium as gym
52
+
53
+ from ray.rllib.env.multi_agent_episode import MultiAgentEpisode
54
+ from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole
55
+ from ray.rllib.utils.replay_buffers import MultiAgentEpisodeReplayBuffer
56
+
57
+
58
+ # Create the environment.
59
+ env = MultiAgentCartPole({"num_agents": 2})
60
+
61
+ # Set up the loop variables
62
+ agent_ids = env.agents
63
+ agent_ids.append("__all__")
64
+ terminateds = {aid: False for aid in agent_ids}
65
+ truncateds = {aid: False for aid in agent_ids}
66
+ num_timesteps = 10000
67
+ episodes = []
68
+
69
+ # Initialize the first episode entries.
70
+ eps = MultiAgentEpisode()
71
+ obs, infos = env.reset()
72
+ eps.add_env_reset(observations=obs, infos=infos)
73
+
74
+ # Sample 10,000 env timesteps.
75
+ for i in range(num_timesteps):
76
+ # If terminated we create a new episode.
77
+ if eps.is_done:
78
+ episodes.append(eps.to_numpy())
79
+ eps = MultiAgentEpisode()
80
+ terminateds = {aid: False for aid in agent_ids}
81
+ truncateds = {aid: False for aid in agent_ids}
82
+ obs, infos = env.reset()
83
+ eps.add_env_reset(observations=obs, infos=infos)
84
+
85
+ # Sample a random action for all agents that should step in the episode
86
+ # next.
87
+ actions = {
88
+ aid: env.get_action_space(aid).sample()
89
+ for aid in eps.get_agents_to_act()
90
+ }
91
+ obs, rewards, terminateds, truncateds, infos = env.step(actions)
92
+ eps.add_env_step(
93
+ obs,
94
+ actions,
95
+ rewards,
96
+ infos,
97
+ terminateds=terminateds,
98
+ truncateds=truncateds
99
+ )
100
+
101
+ # Add the last (truncated) episode to the list of episodes.
102
+ if not eps.is_done:
103
+ episodes.append(eps)
104
+
105
+ # Create the buffer.
106
+ buffer = MultiAgentEpisodeReplayBuffer()
107
+ # Add the list of episodes sampled.
108
+ buffer.add(episodes)
109
+
110
+ # Pull a sample from the buffer using an `n-step` of 3.
111
+ sample = buffer.sample(num_items=256, gamma=0.95, n_step=3)
112
+ """
113
+
114
+ def __init__(
115
+ self,
116
+ capacity: int = 10000,
117
+ *,
118
+ batch_size_B: int = 16,
119
+ batch_length_T: int = 1,
120
+ **kwargs,
121
+ ):
122
+ """Initializes a multi-agent episode replay buffer.
123
+
124
+ Args:
125
+ capacity: The total number of timesteps to be storable in this buffer.
126
+ Will start ejecting old episodes once this limit is reached.
127
+ batch_size_B: The number of episodes returned from `sample()`.
128
+ batch_length_T: The length of each episode in the episode list returned from
129
+ `sample()`.
130
+ """
131
+ # Initialize the base episode replay buffer.
132
+ super().__init__(
133
+ capacity=capacity,
134
+ batch_size_B=batch_size_B,
135
+ batch_length_T=batch_length_T,
136
+ **kwargs,
137
+ )
138
+
139
+ # Stores indices of module (single-agent) timesteps. Each index is a tuple
140
+ # of the form:
141
+ # `(ma_episode_idx, agent_id, timestep)`.
142
+ # This information is stored for each timestep of an episode and is used in
143
+ # the `"independent"`` sampling process. The multi-agent episode index amd the
144
+ # agent ID are used to retrieve the single-agent episode. The timestep is then
145
+ # needed to retrieve the corresponding timestep data from that single-agent
146
+ # episode.
147
+ self._module_to_indices: Dict[
148
+ ModuleID, List[Tuple[int, AgentID, int]]
149
+ ] = defaultdict(list)
150
+
151
+ # Stores the number of single-agent timesteps in the buffer.
152
+ self._num_agent_timesteps: int = 0
153
+ # Stores the number of single-agent timesteps per module.
154
+ self._num_module_timesteps: Dict[ModuleID, int] = defaultdict(int)
155
+
156
+ # Stores the number of added single-agent timesteps over the
157
+ # lifetime of the buffer.
158
+ self._num_agent_timesteps_added: int = 0
159
+ # Stores the number of added single-agent timesteps per module
160
+ # over the lifetime of the buffer.
161
+ self._num_module_timesteps_added: Dict[ModuleID, int] = defaultdict(int)
162
+
163
+ self._num_module_episodes: Dict[ModuleID, int] = defaultdict(int)
164
+ # Stores the number of module episodes evicted. Note, this is
165
+ # important for indexing.
166
+ self._num_module_episodes_evicted: Dict[ModuleID, int] = defaultdict(int)
167
+
168
+ # Stores hte number of module timesteps sampled.
169
+ self.sampled_timesteps_per_module: Dict[ModuleID, int] = defaultdict(int)
170
+
171
    @override(EpisodeReplayBuffer)
    def add(
        self,
        episodes: Union[List["MultiAgentEpisode"], "MultiAgentEpisode"],
    ) -> None:
        """Adds episodes to the replay buffer.

        Note, if the incoming episodes' time steps cause the buffer to overflow,
        older episodes are evicted. Because episodes usually come in chunks and
        not complete, this could lead to edge cases (e.g. with very small capacity
        or very long episode length) where the first part of an episode is evicted
        while the next part just comes in.
        To defend against such case, the complete episode is evicted, including
        the new chunk, unless the episode is the only one in the buffer. In the
        latter case the buffer will be allowed to overflow in a temporary fashion,
        i.e. during the next addition of samples to the buffer an attempt is made
        to fall below capacity again.

        The user is advised to select a large enough buffer with regard to the
        maximum expected episode length.

        Args:
            episodes: The multi-agent episodes to add to the replay buffer. Can
                be a single episode or a list of episodes.
        """
        episodes: List["MultiAgentEpisode"] = force_list(episodes)

        # Account for all env timesteps of the incoming chunks (current count
        # and lifetime total).
        new_episode_ids: Set[str] = {eps.id_ for eps in episodes}
        total_env_timesteps = sum([eps.env_steps() for eps in episodes])
        self._num_timesteps += total_env_timesteps
        self._num_timesteps_added += total_env_timesteps

        # Evict old episodes: keep evicting until we are at/below capacity OR
        # only a single (possibly oversized) episode would remain.
        eps_evicted_ids: Set[Union[str, int]] = set()
        eps_evicted_idxs: Set[int] = set()
        while (
            self._num_timesteps > self.capacity
            and self._num_remaining_episodes(new_episode_ids, eps_evicted_ids) != 1
        ):
            # Evict the oldest episode (leftmost in the deque).
            evicted_episode = self.episodes.popleft()
            eps_evicted_ids.add(evicted_episode.id_)
            eps_evicted_idxs.add(self.episode_id_to_index.pop(evicted_episode.id_))
            # If this episode has a new chunk in the new episodes added,
            # we subtract it again.
            # TODO (sven, simon): Should we just treat such an episode chunk
            # as a new episode?
            if evicted_episode.id_ in new_episode_ids:
                idx = next(
                    i
                    for i, eps in enumerate(episodes)
                    if eps.id_ == evicted_episode.id_
                )
                new_eps_to_evict = episodes.pop(idx)
                self._num_timesteps -= new_eps_to_evict.env_steps()
                self._num_timesteps_added -= new_eps_to_evict.env_steps()
            # Remove the timesteps of the evicted episode from the counters.
            self._num_timesteps -= evicted_episode.env_steps()
            self._num_agent_timesteps -= evicted_episode.agent_steps()
            self._num_episodes_evicted += 1
            # Remove the module timesteps of the evicted episode from the counters.
            self._evict_module_episodes(evicted_episode)
            del evicted_episode

        # Add agent and module steps (current counts and lifetime totals).
        for eps in episodes:
            self._num_agent_timesteps += eps.agent_steps()
            self._num_agent_timesteps_added += eps.agent_steps()
            # Update the module counters by the module timesteps.
            self._update_module_counters(eps)

        # Remove corresponding indices, if episodes were evicted.
        if eps_evicted_idxs:
            # If the episode is not evicted, we keep the index.
            # Note, each index 2-tuple is of the form (ma_episode_idx, timestep)
            # and refers to a certain environment timestep in a certain
            # multi-agent episode.
            self._indices = [
                idx_tuple
                for idx_tuple in self._indices
                if idx_tuple[0] not in eps_evicted_idxs
            ]
            # Also remove corresponding module indices.
            for module_id, module_indices in self._module_to_indices.items():
                # Each index 3-tuple is of the form
                # (ma_episode_idx, agent_id, timestep) and refers to a certain
                # agent timestep in a certain multi-agent episode.
                self._module_to_indices[module_id] = [
                    idx_triplet
                    for idx_triplet in module_indices
                    if idx_triplet[0] not in eps_evicted_idxs
                ]

        for eps in episodes:
            # Deep-copy so the buffer never aliases caller-owned episode objects.
            eps = copy.deepcopy(eps)
            # If the episode is part of an already existing episode, concatenate.
            if eps.id_ in self.episode_id_to_index:
                eps_idx = self.episode_id_to_index[eps.id_]
                # Stored indices count from the start of the buffer's lifetime;
                # subtract the evicted count to get the deque position.
                existing_eps = self.episodes[eps_idx - self._num_episodes_evicted]
                existing_len = len(existing_eps)
                self._indices.extend(
                    [
                        (
                            eps_idx,
                            existing_len + i,
                        )
                        for i in range(len(eps))
                    ]
                )
                # Add new module indices.
                self._add_new_module_indices(eps, eps_idx, True)
                # Concatenate the episode chunk.
                existing_eps.concat_episode(eps)
            # Otherwise, create a new entry.
            else:
                # New episode.
                self.episodes.append(eps)
                eps_idx = len(self.episodes) - 1 + self._num_episodes_evicted
                self.episode_id_to_index[eps.id_] = eps_idx
                self._indices.extend([(eps_idx, i) for i in range(len(eps))])
                # Add new module indices.
                self._add_new_module_indices(eps, eps_idx, False)
+
294
+ @override(EpisodeReplayBuffer)
295
+ def sample(
296
+ self,
297
+ num_items: Optional[int] = None,
298
+ *,
299
+ batch_size_B: Optional[int] = None,
300
+ batch_length_T: Optional[int] = None,
301
+ n_step: Optional[Union[int, Tuple]] = 1,
302
+ gamma: float = 0.99,
303
+ include_infos: bool = False,
304
+ include_extra_model_outputs: bool = False,
305
+ replay_mode: str = "independent",
306
+ modules_to_sample: Optional[List[ModuleID]] = None,
307
+ **kwargs,
308
+ ) -> Union[List["MultiAgentEpisode"], List["SingleAgentEpisode"]]:
309
+ """Samples a batch of multi-agent transitions.
310
+
311
+ Multi-agent transitions can be sampled either `"independent"` or
312
+ `"synchronized"` with the former sampling for each module independent agent
313
+ steps and the latter sampling agent transitions from the same environment step.
314
+
315
+ The n-step parameter can be either a single integer or a tuple of two integers.
316
+ In the former case, the n-step is fixed to the given integer and in the latter
317
+ case, the n-step is sampled uniformly from the given range. Large n-steps could
318
+ potentially lead to a many retries because not all samples might have a full
319
+ n-step transition.
320
+
321
+ Sampling returns batches of size B (number of 'rows'), where each row is a tuple
322
+ of the form
323
+
324
+ `(o_t, a_t, sum(r_t+1:t+n), o_t+n)`
325
+
326
+ where `o_t` is the observation in `t`, `a_t` the action chosen at observation
327
+ `o_t`, `o_t+n` is the observation `n` timesteps later and `sum(r_t+1:t+n)` is
328
+ the sum of all rewards collected over the time steps between `t+1` and `t+n`.
329
+ The n`-step can be chosen freely when sampling and defaults to `1`. If `n_step`
330
+ is a tuple it is sampled uniformly across the interval defined by the tuple (for
331
+ each row in the batch).
332
+
333
+ Each batch contains - in addition to the data tuples presented above - two
334
+ further columns, namely `n_steps` and `weigths`. The former holds the `n_step`
335
+ used for each row in the batch and the latter a (default) weight of `1.0` for
336
+ each row in the batch. This weight is used for weighted loss calculations in
337
+ the training process.
338
+
339
+ Args:
340
+ num_items: The number of items to sample. If provided, `batch_size_B`
341
+ should be `None`.
342
+ batch_size_B: The batch size to sample. If provided, `num_items`
343
+ should be `None`.
344
+ batch_length_T: The length of the sampled batch. If not provided, the
345
+ default batch length is used. This feature is not yet implemented.
346
+ n_step: The n-step to sample. If the n-step is a tuple, the n-step is
347
+ sampled uniformly from the given range. If not provided, the default
348
+ n-step of `1` is used.
349
+ gamma: The discount factor for the n-step reward calculation.
350
+ include_infos: Whether to include the infos in the sampled batch.
351
+ include_extra_model_outputs: Whether to include the extra model outputs
352
+ in the sampled batch.
353
+ replay_mode: The replay mode to use for sampling. Either `"independent"`
354
+ or `"synchronized"`.
355
+ modules_to_sample: A list of module IDs to sample from. If not provided,
356
+ transitions for aall modules are sampled.
357
+
358
+ Returns:
359
+ A dictionary of the form `ModuleID -> SampleBatchType` containing the
360
+ sampled data for each module or each module in `modules_to_sample`,
361
+ if provided.
362
+ """
363
+ if num_items is not None:
364
+ assert batch_size_B is None, (
365
+ "Cannot call `sample()` with both `num_items` and `batch_size_B` "
366
+ "provided! Use either one."
367
+ )
368
+ batch_size_B = num_items
369
+
370
+ # Use our default values if no sizes/lengths provided.
371
+ batch_size_B = batch_size_B or self.batch_size_B
372
+ # TODO (simon): Implement trajectory sampling for RNNs.
373
+ batch_length_T = batch_length_T or self.batch_length_T
374
+
375
+ # Sample for each module independently.
376
+ if replay_mode == "independent":
377
+ return self._sample_independent(
378
+ batch_size_B=batch_size_B,
379
+ batch_length_T=batch_length_T,
380
+ n_step=n_step,
381
+ gamma=gamma,
382
+ include_infos=include_infos,
383
+ include_extra_model_outputs=include_extra_model_outputs,
384
+ modules_to_sample=modules_to_sample,
385
+ )
386
+ else:
387
+ return self._sample_synchonized(
388
+ batch_size_B=batch_size_B,
389
+ batch_length_T=batch_length_T,
390
+ n_step=n_step,
391
+ gamma=gamma,
392
+ include_infos=include_infos,
393
+ include_extra_model_outputs=include_extra_model_outputs,
394
+ modules_to_sample=modules_to_sample,
395
+ )
396
+
397
+ def get_added_agent_timesteps(self) -> int:
398
+ """Returns number of agent timesteps that have been added in buffer's lifetime.
399
+
400
+ Note, this could be more than the `get_added_timesteps` returns as an
401
+ environment timestep could contain multiple agent timesteps (for eaxch agent
402
+ one).
403
+ """
404
+ return self._num_agent_timesteps_added
405
+
406
+ def get_module_ids(self) -> List[ModuleID]:
407
+ """Returns a list of module IDs stored in the buffer."""
408
+ return list(self._module_to_indices.keys())
409
+
410
+ def get_num_agent_timesteps(self) -> int:
411
+ """Returns number of agent timesteps stored in the buffer.
412
+
413
+ Note, this could be more than the `num_timesteps` as an environment timestep
414
+ could contain multiple agent timesteps (for eaxch agent one).
415
+ """
416
+ return self._num_agent_timesteps
417
+
418
+ @override(EpisodeReplayBuffer)
419
+ def get_num_episodes(self, module_id: Optional[ModuleID] = None) -> int:
420
+ """Returns number of episodes stored for a module in the buffer.
421
+
422
+ Note, episodes could be either complete or truncated.
423
+
424
+ Args:
425
+ module_id: The ID of the module to query. If not provided, the number of
426
+ episodes for all modules is returned.
427
+
428
+ Returns:
429
+ The number of episodes stored for the module or all modules.
430
+ """
431
+ return (
432
+ self._num_module_episodes[module_id]
433
+ if module_id
434
+ else super().get_num_episodes()
435
+ )
436
+
437
+ @override(EpisodeReplayBuffer)
438
+ def get_num_episodes_evicted(self, module_id: Optional[ModuleID] = None) -> int:
439
+ """Returns number of episodes evicted for a module in the buffer."""
440
+ return (
441
+ self._num_module_episodes_evicted[module_id]
442
+ if module_id
443
+ else super().get_num_episodes_evicted()
444
+ )
445
+
446
+ @override(EpisodeReplayBuffer)
447
+ def get_num_timesteps(self, module_id: Optional[ModuleID] = None) -> int:
448
+ """Returns number of individual timesteps for a module stored in the buffer.
449
+
450
+ Args:
451
+ module_id: The ID of the module to query. If not provided, the number of
452
+ timesteps for all modules are returned.
453
+
454
+ Returns:
455
+ The number of timesteps stored for the module or all modules.
456
+ """
457
+ return (
458
+ self._num_module_timesteps[module_id]
459
+ if module_id
460
+ else super().get_num_timesteps()
461
+ )
462
+
463
+ @override(EpisodeReplayBuffer)
464
+ def get_sampled_timesteps(self, module_id: Optional[ModuleID] = None) -> int:
465
+ """Returns number of timesteps that have been sampled for a module.
466
+
467
+ Args:
468
+ module_id: The ID of the module to query. If not provided, the number of
469
+ sampled timesteps for all modules are returned.
470
+
471
+ Returns:
472
+ The number of timesteps sampled for the module or all modules.
473
+ """
474
+ return (
475
+ self.sampled_timesteps_per_module[module_id]
476
+ if module_id
477
+ else super().get_sampled_timesteps()
478
+ )
479
+
480
+ @override(EpisodeReplayBuffer)
481
+ def get_added_timesteps(self, module_id: Optional[ModuleID] = None) -> int:
482
+ """Returns the number of timesteps added in buffer's lifetime for given module.
483
+
484
+ Args:
485
+ module_id: The ID of the module to query. If not provided, the total number
486
+ of timesteps ever added.
487
+
488
+ Returns:
489
+ The number of timesteps added for `module_id` (or all modules if `module_id`
490
+ is None).
491
+ """
492
+ return (
493
+ self._num_module_timesteps_added[module_id]
494
+ if module_id
495
+ else super().get_added_timesteps()
496
+ )
497
+
498
+ @override(EpisodeReplayBuffer)
499
+ def get_state(self) -> Dict[str, Any]:
500
+ """Gets a pickable state of the buffer.
501
+
502
+ This is used for checkpointing the buffer's state. It is specifically helpful,
503
+ for example, when a trial is paused and resumed later on. The buffer's state
504
+ can be saved to disk and reloaded when the trial is resumed.
505
+
506
+ Returns:
507
+ A dict containing all necessary information to restore the buffer's state.
508
+ """
509
+ return super().get_state() | {
510
+ "_module_to_indices": list(self._module_to_indices.items()),
511
+ "_num_agent_timesteps": self._num_agent_timesteps,
512
+ "_num_agent_timesteps_added": self._num_agent_timesteps_added,
513
+ "_num_module_timesteps": list(self._num_module_timesteps.items()),
514
+ "_num_module_timesteps_added": list(
515
+ self._num_module_timesteps_added.items()
516
+ ),
517
+ "_num_module_episodes": list(self._num_module_episodes.items()),
518
+ "_num_module_episodes_evicted": list(
519
+ self._num_module_episodes_evicted.items()
520
+ ),
521
+ "sampled_timesteps_per_module": list(
522
+ self.sampled_timesteps_per_module.items()
523
+ ),
524
+ }
525
+
526
+ @override(EpisodeReplayBuffer)
527
+ def set_state(self, state) -> None:
528
+ """Sets the state of a buffer from a previously stored state.
529
+
530
+ See `get_state()` for more information on what is stored in the state. This
531
+ method is used to restore the buffer's state from a previously stored state.
532
+ It is specifically helpful, for example, when a trial is paused and resumed
533
+ later on. The buffer's state can be saved to disk and reloaded when the trial
534
+ is resumed.
535
+
536
+ Args:
537
+ state: The state to restore the buffer from.
538
+ """
539
+ # Set the episodes.
540
+ self._set_episodes(state)
541
+ # Set the super's state.
542
+ super().set_state(state)
543
+ # Now set the remaining attributes.
544
+ self._module_to_indices = defaultdict(list, dict(state["_module_to_indices"]))
545
+ self._num_agent_timesteps = state["_num_agent_timesteps"]
546
+ self._num_agent_timesteps_added = state["_num_agent_timesteps_added"]
547
+ self._num_module_timesteps = defaultdict(
548
+ int, dict(state["_num_module_timesteps"])
549
+ )
550
+ self._num_module_timesteps_added = defaultdict(
551
+ int, dict(state["_num_module_timesteps_added"])
552
+ )
553
+ self._num_module_episodes = defaultdict(
554
+ int, dict(state["_num_module_episodes"])
555
+ )
556
+ self._num_module_episodes_evicted = defaultdict(
557
+ int, dict(state["_num_module_episodes_evicted"])
558
+ )
559
+ self.sampled_timesteps_per_module = defaultdict(
560
+ list, dict(state["sampled_timesteps_per_module"])
561
+ )
562
+
563
+ def _set_episodes(self, state: Dict[str, Any]) -> None:
564
+ """Sets the episodes from the state."""
565
+ if not self.episodes:
566
+ self.episodes = deque(
567
+ [
568
+ MultiAgentEpisode.from_state(eps_data)
569
+ for eps_data in state["episodes"]
570
+ ]
571
+ )
572
    def _sample_independent(
        self,
        batch_size_B: Optional[int],
        batch_length_T: Optional[int],
        n_step: Optional[Union[int, Tuple[int, int]]],
        gamma: float,
        include_infos: bool,
        include_extra_model_outputs: bool,
        modules_to_sample: Optional[Set[ModuleID]],
    ) -> List["SingleAgentEpisode"]:
        """Samples a batch of independent multi-agent transitions.

        For each module (or each module in `modules_to_sample`), draws
        `batch_size_B` single-agent transitions uniformly from that module's
        stored timesteps, independently of the other modules. Each sampled
        transition is returned as a length-1 `SingleAgentEpisode` of the form
        `(o_t, a_t, sum(r_t+1:t+n), o_t+n)` with `weights` and `n_step` in its
        `extra_model_outputs`.
        """

        actual_n_step = n_step or 1
        # Sample the n-step if necessary.
        random_n_step = isinstance(n_step, (tuple, list))

        sampled_episodes = []
        # TODO (simon): Ensure that the module has data and if not, skip it.
        # TODO (sven): Should we then error out or skip? I think the Learner
        # should handle this case when a module has no train data.
        modules_to_sample = modules_to_sample or set(self._module_to_indices.keys())
        for module_id in modules_to_sample:
            module_indices = self._module_to_indices[module_id]
            B = 0
            while B < batch_size_B:
                # Now sample from the single-agent timesteps (uniformly).
                index_tuple = module_indices[self.rng.integers(len(module_indices))]

                # This will be an agent timestep (not env timestep).
                # TODO (simon, sven): Maybe deprecate sa_episode_idx (_) in the index
                # quads. Is there any need for it?
                # Stored indices count episodes since the buffer's start; subtract
                # the evicted count to get the actual deque position.
                ma_episode_idx, agent_id, sa_episode_ts = (
                    index_tuple[0] - self._num_episodes_evicted,
                    index_tuple[1],
                    index_tuple[2],
                )

                # Get the multi-agent episode.
                ma_episode = self.episodes[ma_episode_idx]
                # Retrieve the single-agent episode for filtering.
                sa_episode = ma_episode.agent_episodes[agent_id]

                # If we use random n-step sampling, draw the n-step for this item.
                if random_n_step:
                    actual_n_step = int(self.rng.integers(n_step[0], n_step[1]))
                # If we cannot make the n-step, we resample.
                # NOTE(review): this retries without bound - presumably at least
                # one stored timestep always admits a full n-step transition;
                # verify for very large n-steps.
                if sa_episode_ts + actual_n_step > len(sa_episode):
                    continue
                # Note, this will be the reward after executing action
                # `a_(episode_ts)`. For `n_step>1` this will be the discounted sum
                # of all rewards that were collected over the last n steps.
                # (lfilter over the reversed rewards computes the discounted sum;
                # the last element is sum(gamma^i * r_i).)
                sa_raw_rewards = sa_episode.get_rewards(
                    slice(sa_episode_ts, sa_episode_ts + actual_n_step)
                )
                sa_rewards = scipy.signal.lfilter(
                    [1], [1, -gamma], sa_raw_rewards[::-1], axis=0
                )[-1]

                sampled_sa_episode = SingleAgentEpisode(
                    id_=sa_episode.id_,
                    # Provide the IDs for the learner connector.
                    agent_id=sa_episode.agent_id,
                    module_id=sa_episode.module_id,
                    multi_agent_episode_id=ma_episode.id_,
                    # Ensure that each episode contains a tuple of the form:
                    # (o_t, a_t, sum(r_(t:t+n_step)), o_(t+n_step))
                    # Two observations (t and t+n).
                    observations=[
                        sa_episode.get_observations(sa_episode_ts),
                        sa_episode.get_observations(sa_episode_ts + actual_n_step),
                    ],
                    observation_space=sa_episode.observation_space,
                    infos=(
                        [
                            sa_episode.get_infos(sa_episode_ts),
                            sa_episode.get_infos(sa_episode_ts + actual_n_step),
                        ]
                        if include_infos
                        else None
                    ),
                    actions=[sa_episode.get_actions(sa_episode_ts)],
                    action_space=sa_episode.action_space,
                    rewards=[sa_rewards],
                    # If the sampled single-agent episode is the single-agent episode's
                    # last time step, check, if the single-agent episode is terminated
                    # or truncated.
                    terminated=(
                        sa_episode_ts + actual_n_step >= len(sa_episode)
                        and sa_episode.is_terminated
                    ),
                    truncated=(
                        sa_episode_ts + actual_n_step >= len(sa_episode)
                        and sa_episode.is_truncated
                    ),
                    extra_model_outputs={
                        "weights": [1.0],
                        "n_step": [actual_n_step],
                        **(
                            {
                                k: [
                                    sa_episode.get_extra_model_outputs(k, sa_episode_ts)
                                ]
                                for k in sa_episode.extra_model_outputs.keys()
                            }
                            if include_extra_model_outputs
                            else {}
                        ),
                    },
                    # TODO (sven): Support lookback buffers.
                    len_lookback_buffer=0,
                    t_started=sa_episode_ts,
                )
                # Append single-agent episode to the list of sampled episodes.
                sampled_episodes.append(sampled_sa_episode)

                # Increase counter.
                B += 1

            # Increase the per module timesteps counter.
            self.sampled_timesteps_per_module[module_id] += B

        # Increase the counter for environment timesteps.
        # NOTE(review): this adds `batch_size_B` once in total, even though
        # each sampled module contributed `batch_size_B` transitions - confirm
        # this is the intended env-step accounting.
        self.sampled_timesteps += batch_size_B
        # Return multi-agent dictionary.
        return sampled_episodes
+
699
+ def _sample_synchonized(
700
+ self,
701
+ batch_size_B: Optional[int],
702
+ batch_length_T: Optional[int],
703
+ n_step: Optional[Union[int, Tuple]],
704
+ gamma: float,
705
+ include_infos: bool,
706
+ include_extra_model_outputs: bool,
707
+ modules_to_sample: Optional[List[ModuleID]],
708
+ ) -> SampleBatchType:
709
+ """Samples a batch of synchronized multi-agent transitions."""
710
+ # Sample the n-step if necessary.
711
+ if isinstance(n_step, tuple):
712
+ # Use random n-step sampling.
713
+ random_n_step = True
714
+ else:
715
+ actual_n_step = n_step or 1
716
+ random_n_step = False
717
+
718
+ # Containers for the sampled data.
719
+ observations: Dict[ModuleID, List[ObsType]] = defaultdict(list)
720
+ next_observations: Dict[ModuleID, List[ObsType]] = defaultdict(list)
721
+ actions: Dict[ModuleID, List[ActType]] = defaultdict(list)
722
+ rewards: Dict[ModuleID, List[float]] = defaultdict(list)
723
+ is_terminated: Dict[ModuleID, List[bool]] = defaultdict(list)
724
+ is_truncated: Dict[ModuleID, List[bool]] = defaultdict(list)
725
+ weights: Dict[ModuleID, List[float]] = defaultdict(list)
726
+ n_steps: Dict[ModuleID, List[int]] = defaultdict(list)
727
+ # If `info` should be included, construct also a container for them.
728
+ if include_infos:
729
+ infos: Dict[ModuleID, List[Dict[str, Any]]] = defaultdict(list)
730
+ # If `extra_model_outputs` should be included, construct a container for them.
731
+ if include_extra_model_outputs:
732
+ extra_model_outputs: Dict[ModuleID, List[Dict[str, Any]]] = defaultdict(
733
+ list
734
+ )
735
+
736
+ B = 0
737
+ while B < batch_size_B:
738
+ index_tuple = self._indices[self.rng.integers(len(self._indices))]
739
+
740
+ # This will be an env timestep (not agent timestep)
741
+ ma_episode_idx, ma_episode_ts = (
742
+ index_tuple[0] - self._num_episodes_evicted,
743
+ index_tuple[1],
744
+ )
745
+ # If we use random n-step sampling, draw the n-step for this item.
746
+ if random_n_step:
747
+ actual_n_step = int(self.rng.integers(n_step[0], n_step[1]))
748
+ # If we are at the end of an episode, continue.
749
+ # Note, priority sampling got us `o_(t+n)` and we need for the loss
750
+ # calculation in addition `o_t`.
751
+ # TODO (simon): Maybe introduce a variable `num_retries` until the
752
+ # while loop should break when not enough samples have been collected
753
+ # to make n-step possible.
754
+ if ma_episode_ts - actual_n_step < 0:
755
+ continue
756
+
757
+ # Retrieve the multi-agent episode.
758
+ ma_episode = self.episodes[ma_episode_idx]
759
+
760
+ # Ensure that each row contains a tuple of the form:
761
+ # (o_t, a_t, sum(r_(t:t+n_step)), o_(t+n_step))
762
+ # TODO (simon): Implement version for sequence sampling when using RNNs.
763
+ eps_observation = ma_episode.get_observations(
764
+ slice(ma_episode_ts - actual_n_step, ma_episode_ts + 1),
765
+ return_list=True,
766
+ )
767
+ # Note, `MultiAgentEpisode` stores the action that followed
768
+ # `o_t` with `o_(t+1)`, therefore, we need the next one.
769
+ # TODO (simon): This gets the wrong action as long as the getters are not
770
+ # fixed.
771
+ eps_actions = ma_episode.get_actions(ma_episode_ts - actual_n_step)
772
+ # Make sure that at least a single agent should have full transition.
773
+ # TODO (simon): Filter for the `modules_to_sample`.
774
+ agents_to_sample = self._agents_with_full_transitions(
775
+ eps_observation,
776
+ eps_actions,
777
+ )
778
+ # If not, we resample.
779
+ if not agents_to_sample:
780
+ continue
781
+ # TODO (simon, sven): Do we need to include the common agent rewards?
782
+ # Note, the reward that is collected by transitioning from `o_t` to
783
+ # `o_(t+1)` is stored in the next transition in `MultiAgentEpisode`.
784
+ eps_rewards = ma_episode.get_rewards(
785
+ slice(ma_episode_ts - actual_n_step, ma_episode_ts),
786
+ return_list=True,
787
+ )
788
+ # TODO (simon, sven): Do we need to include the common infos? And are
789
+ # there common extra model outputs?
790
+ if include_infos:
791
+ # If infos are included we include the ones from the last timestep
792
+ # as usually the info contains additional values about the last state.
793
+ eps_infos = ma_episode.get_infos(ma_episode_ts)
794
+ if include_extra_model_outputs:
795
+ # If `extra_model_outputs` are included we include the ones from the
796
+ # first timestep as usually the `extra_model_outputs` contain additional
797
+ # values from the forward pass that produced the action at the first
798
+ # timestep.
799
+ # Note, we extract them into single row dictionaries similar to the
800
+ # infos, in a connector we can then extract these into single batch
801
+ # rows.
802
+ eps_extra_model_outputs = {
803
+ k: ma_episode.get_extra_model_outputs(
804
+ k, ma_episode_ts - actual_n_step
805
+ )
806
+ for k in ma_episode.extra_model_outputs.keys()
807
+ }
808
+ # If the sampled time step is the episode's last time step check, if
809
+ # the episode is terminated or truncated.
810
+ episode_terminated = False
811
+ episode_truncated = False
812
+ if ma_episode_ts == ma_episode.env_t:
813
+ episode_terminated = ma_episode.is_terminated
814
+ episode_truncated = ma_episode.is_truncated
815
+ # TODO (simon): Filter for the `modules_to_sample`.
816
+ # TODO (sven, simon): We could here also sample for all agents in the
817
+ # `modules_to_sample` and then adapt the `n_step` for agents that
818
+ # have not a full transition.
819
+ for agent_id in agents_to_sample:
820
+ # Map our agent to the corresponding module we want to
821
+ # train.
822
+ module_id = ma_episode._agent_to_module_mapping[agent_id]
823
+ # Sample only for the modules in `modules_to_sample`.
824
+ if module_id not in (
825
+ modules_to_sample or self._module_to_indices.keys()
826
+ ):
827
+ continue
828
+ # TODO (simon, sven): Here we could skip for modules not
829
+ # to be sampled in `modules_to_sample`.
830
+ observations[module_id].append(eps_observation[0][agent_id])
831
+ next_observations[module_id].append(eps_observation[-1][agent_id])
832
+ # Fill missing rewards with zeros.
833
+ agent_rewards = [r[agent_id] or 0.0 for r in eps_rewards]
834
+ rewards[module_id].append(
835
+ scipy.signal.lfilter([1], [1, -gamma], agent_rewards[::-1], axis=0)[
836
+ -1
837
+ ]
838
+ )
839
+ # Note, this should exist, as we filtered for agents with full
840
+ # transitions.
841
+ actions[module_id].append(eps_actions[agent_id])
842
+ if include_infos:
843
+ infos[module_id].append(eps_infos[agent_id])
844
+ if include_extra_model_outputs:
845
+ extra_model_outputs[module_id].append(
846
+ {
847
+ k: eps_extra_model_outputs[agent_id][k]
848
+ for k in eps_extra_model_outputs[agent_id].keys()
849
+ }
850
+ )
851
+ # If sampled observation is terminal for the agent. Either MAE
852
+ # episode is truncated/terminated or SAE episode is truncated/
853
+ # terminated at this ts.
854
+ # TODO (simon, sven): Add method agent_alive(ts) to MAE.
855
+ # or add slicing to get_terminateds().
856
+ agent_ts = ma_episode.env_t_to_agent_t[agent_id][ma_episode_ts]
857
+ agent_eps = ma_episode.agent_episodes[agent_id]
858
+ agent_terminated = agent_ts == agent_eps.t and agent_eps.is_terminated
859
+ agent_truncated = (
860
+ agent_ts == agent_eps.t
861
+ and agent_eps.is_truncated
862
+ and not agent_eps.is_terminated
863
+ )
864
+ if episode_terminated or agent_terminated:
865
+ is_terminated[module_id].append(True)
866
+ is_truncated[module_id].append(False)
867
+ elif episode_truncated or agent_truncated:
868
+ is_truncated[module_id].append(True)
869
+ is_terminated[module_id].append(False)
870
+ else:
871
+ is_terminated[module_id].append(False)
872
+ is_truncated[module_id].append(False)
873
+ # Increase the per module counter.
874
+ self.sampled_timesteps_per_module[module_id] += 1
875
+
876
+ # Increase counter.
877
+ B += 1
878
+ # Increase the counter for environment timesteps.
879
+ self.sampled_timesteps += batch_size_B
880
+
881
+ # Should be convertible to MultiAgentBatch.
882
+ ret = {
883
+ **{
884
+ module_id: {
885
+ Columns.OBS: batch(observations[module_id]),
886
+ Columns.ACTIONS: batch(actions[module_id]),
887
+ Columns.REWARDS: np.array(rewards[module_id]),
888
+ Columns.NEXT_OBS: batch(next_observations[module_id]),
889
+ Columns.TERMINATEDS: np.array(is_terminated[module_id]),
890
+ Columns.TRUNCATEDS: np.array(is_truncated[module_id]),
891
+ "weights": np.array(weights[module_id]),
892
+ "n_step": np.array(n_steps[module_id]),
893
+ }
894
+ for module_id in observations.keys()
895
+ }
896
+ }
897
+
898
+ # Return multi-agent dictionary.
899
+ return ret
900
+
901
+ def _num_remaining_episodes(self, new_eps, evicted_eps):
902
+ """Calculates the number of remaining episodes.
903
+
904
+ When adding episodes and evicting them in the `add()` method
905
+ this function calculates iteratively the number of remaining
906
+ episodes.
907
+
908
+ Args:
909
+ new_eps: List of new episode IDs.
910
+ evicted_eps: List of evicted episode IDs.
911
+
912
+ Returns:
913
+ Number of episodes remaining after evicting the episodes in
914
+ `evicted_eps` and adding the episode in `new_eps`.
915
+ """
916
+ return len(
917
+ set(self.episode_id_to_index.keys()).union(set(new_eps)) - set(evicted_eps)
918
+ )
919
+
920
+ def _evict_module_episodes(self, ma_episode: MultiAgentEpisode) -> None:
921
+ """Evicts the module episodes from the buffer adn updates all counters.
922
+
923
+ Args:
924
+ multi_agent_eps: The multi-agent episode to evict from the buffer.
925
+ """
926
+
927
+ # Note we need to take the agent ids from the evicted episode because
928
+ # different episodes can have different agents and module mappings.
929
+ for agent_id in ma_episode.agent_episodes:
930
+ # Retrieve the corresponding module ID and module episode.
931
+ module_id = ma_episode._agent_to_module_mapping[agent_id]
932
+ module_eps = ma_episode.agent_episodes[agent_id]
933
+ # Update all counters.
934
+ self._num_module_timesteps[module_id] -= module_eps.env_steps()
935
+ self._num_module_episodes[module_id] -= 1
936
+ self._num_module_episodes_evicted[module_id] += 1
937
+
938
+ def _update_module_counters(self, ma_episode: MultiAgentEpisode) -> None:
939
+ """Updates the module counters after adding an episode.
940
+
941
+ Args:
942
+ multi_agent_episode: The multi-agent episode to update the module counters
943
+ for.
944
+ """
945
+ for agent_id in ma_episode.agent_ids:
946
+ agent_steps = ma_episode.agent_episodes[agent_id].env_steps()
947
+ # Only add if the agent has stepped in the episode (chunk).
948
+ if agent_steps > 0:
949
+ # Receive the corresponding module ID.
950
+ module_id = ma_episode.module_for(agent_id)
951
+ self._num_module_timesteps[module_id] += agent_steps
952
+ self._num_module_timesteps_added[module_id] += agent_steps
953
+ # if ma_episode.agent_episodes[agent_id].is_done:
954
+ # # TODO (simon): Check, if we do not count the same episode
955
+ # # multiple times.
956
+ # # Also add to the module episode counter.
957
+ # self._num_module_episodes[module_id] += 1
958
+
959
+ def _add_new_module_indices(
960
+ self,
961
+ ma_episode: MultiAgentEpisode,
962
+ episode_idx: int,
963
+ ma_episode_exists: bool = True,
964
+ ) -> None:
965
+ """Adds the module indices for new episode chunks.
966
+
967
+ Args:
968
+ ma_episode: The multi-agent episode to add the module indices for.
969
+ episode_idx: The index of the episode in the `self.episodes`.
970
+ ma_episode_exists: Whether `ma_episode` is already in this buffer (with a
971
+ predecessor chunk to which we'll concatenate `ma_episode` later).
972
+ """
973
+ existing_ma_episode = None
974
+ if ma_episode_exists:
975
+ existing_ma_episode = self.episodes[
976
+ self.episode_id_to_index[ma_episode.id_] - self._num_episodes_evicted
977
+ ]
978
+
979
+ # Note, we iterate through the agent episodes b/c we want to store records
980
+ # and some agents could not have entered the environment.
981
+ for agent_id in ma_episode.agent_episodes:
982
+ # Get the corresponding module id.
983
+ module_id = ma_episode.module_for(agent_id)
984
+ # Get the module episode.
985
+ module_eps = ma_episode.agent_episodes[agent_id]
986
+
987
+ # Is the agent episode already in the buffer's existing `ma_episode`?
988
+ if ma_episode_exists and agent_id in existing_ma_episode.agent_episodes:
989
+ existing_sa_eps_len = len(existing_ma_episode.agent_episodes[agent_id])
990
+ # Otherwise, it is a new single-agent episode and we increase the counter.
991
+ else:
992
+ existing_sa_eps_len = 0
993
+ self._num_module_episodes[module_id] += 1
994
+
995
+ # Add new module indices.
996
+ self._module_to_indices[module_id].extend(
997
+ [
998
+ (
999
+ # Keep the MAE index for sampling
1000
+ episode_idx,
1001
+ agent_id,
1002
+ existing_sa_eps_len + i,
1003
+ )
1004
+ for i in range(len(module_eps))
1005
+ ]
1006
+ )
1007
+
1008
+ def _agents_with_full_transitions(
1009
+ self, observations: Dict[AgentID, ObsType], actions: Dict[AgentID, ActType]
1010
+ ):
1011
+ """Filters for agents that have full transitions.
1012
+
1013
+ Args:
1014
+ observations: The observations of the episode.
1015
+ actions: The actions of the episode.
1016
+
1017
+ Returns:
1018
+ List of agent IDs that have full transitions.
1019
+ """
1020
+ agents_to_sample = []
1021
+ for agent_id in observations[0].keys():
1022
+ # Only if the agent has an action at the first and an observation
1023
+ # at the first and last timestep of the n-step transition, we can sample it.
1024
+ if agent_id in actions and agent_id in observations[-1]:
1025
+ agents_to_sample.append(agent_id)
1026
+ return agents_to_sample
.venv/lib/python3.11/site-packages/ray/rllib/utils/replay_buffers/multi_agent_mixin_replay_buffer.py ADDED
@@ -0,0 +1,404 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import collections
2
+ import logging
3
+ import random
4
+ from typing import Any, Dict, Optional
5
+
6
+ import numpy as np
7
+
8
+ from ray.rllib.policy.rnn_sequencing import timeslice_along_seq_lens_with_overlap
9
+ from ray.rllib.policy.sample_batch import (
10
+ DEFAULT_POLICY_ID,
11
+ SampleBatch,
12
+ concat_samples_into_ma_batch,
13
+ )
14
+ from ray.rllib.utils.annotations import override
15
+ from ray.rllib.utils.replay_buffers.multi_agent_prioritized_replay_buffer import (
16
+ MultiAgentPrioritizedReplayBuffer,
17
+ )
18
+ from ray.rllib.utils.replay_buffers.multi_agent_replay_buffer import (
19
+ MultiAgentReplayBuffer,
20
+ ReplayMode,
21
+ merge_dicts_with_warning,
22
+ )
23
+ from ray.rllib.utils.replay_buffers.replay_buffer import _ALL_POLICIES, StorageUnit
24
+ from ray.rllib.utils.typing import PolicyID, SampleBatchType
25
+ from ray.util.annotations import DeveloperAPI
26
+ from ray.util.debug import log_once
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
@DeveloperAPI
class MultiAgentMixInReplayBuffer(MultiAgentPrioritizedReplayBuffer):
    """This buffer adds replayed samples to a stream of new experiences.

    - Any newly added batch (`add()`) is immediately returned upon
      the next `sample` call (close to on-policy) as well as being moved
      into the buffer.
    - Additionally, a certain number of old samples is mixed into the
      returned sample according to a given "replay ratio".
    - If >1 calls to `add()` are made without any `sample()` calls
      in between, all newly added batches are returned (plus some older
      samples according to the "replay ratio").

    .. testcode::
        :skipif: True

        # replay ratio 0.66 (2/3 replayed, 1/3 new samples):
        buffer = MultiAgentMixInReplayBuffer(capacity=100,
                                             replay_ratio=0.66)
        buffer.add(<A>)
        buffer.add(<B>)
        buffer.sample(1)

    .. testoutput::

        ..[<A>, <B>, <B>]

    .. testcode::
        :skipif: True

        buffer.add(<C>)
        buffer.sample(1)

    .. testoutput::

        [<C>, <A>, <B>]
        or: [<C>, <A>, <A>], [<C>, <B>, <A>] or [<C>, <B>, <B>],
        but always <C> as it is the newest sample

    .. testcode::
        :skipif: True

        buffer.add(<D>)
        buffer.sample(1)

    .. testoutput::

        [<D>, <A>, <C>]
        or [<D>, <A>, <A>], [<D>, <B>, <A>] or [<D>, <B>, <C>], etc..
        but always <D> as it is the newest sample

    .. testcode::
        :skipif: True

        # replay proportion 0.0 -> replay disabled:
        buffer = MixInReplay(capacity=100, replay_ratio=0.0)
        buffer.add(<A>)
        buffer.sample()

    .. testoutput::

        [<A>]

    .. testcode::
        :skipif: True

        buffer.add(<B>)
        buffer.sample()

    .. testoutput::

        [<B>]
    """

    def __init__(
        self,
        capacity: int = 10000,
        storage_unit: str = "timesteps",
        num_shards: int = 1,
        replay_mode: str = "independent",
        replay_sequence_override: bool = True,
        replay_sequence_length: int = 1,
        replay_burn_in: int = 0,
        replay_zero_init_states: bool = True,
        replay_ratio: float = 0.66,
        underlying_buffer_config: dict = None,
        prioritized_replay_alpha: float = 0.6,
        prioritized_replay_beta: float = 0.4,
        prioritized_replay_eps: float = 1e-6,
        **kwargs
    ):
        """Initializes MultiAgentMixInReplayBuffer instance.

        Args:
            capacity: The capacity of the buffer, measured in `storage_unit`.
            storage_unit: Either 'timesteps', 'sequences' or
                'episodes'. Specifies how experiences are stored. If they
                are stored in episodes, replay_sequence_length is ignored.
            num_shards: The number of buffer shards that exist in total
                (including this one).
            replay_mode: One of "independent" or "lockstep". Determines,
                whether batches are sampled independently or to an equal
                amount.
            replay_sequence_override: If True, ignore sequences found in incoming
                batches, slicing them into sequences as specified by
                `replay_sequence_length` and `replay_sequence_burn_in`. This only
                has an effect if storage_unit is `sequences`.
            replay_sequence_length: The sequence length (T) of a single
                sample. If > 1, we will sample B x T from this buffer. This
                only has an effect if storage_unit is 'timesteps'.
            replay_burn_in: The burn-in length in case
                `replay_sequence_length` > 0. This is the number of timesteps
                each sequence overlaps with the previous one to generate a
                better internal state (=state after the burn-in), instead of
                starting from 0.0 each RNN rollout.
            replay_zero_init_states: Whether the initial states in the
                buffer (if replay_sequence_length > 0) are alwayas 0.0 or
                should be updated with the previous train_batch state outputs.
            replay_ratio: Ratio of replayed samples in the returned
                batches. E.g. a ratio of 0.0 means only return new samples
                (no replay), a ratio of 0.5 means always return newest sample
                plus one old one (1:1), a ratio of 0.66 means always return
                the newest sample plus 2 old (replayed) ones (1:2), etc...
            underlying_buffer_config: A config that contains all necessary
                constructor arguments and arguments for methods to call on
                the underlying buffers. This replaces the standard behaviour
                of the underlying PrioritizedReplayBuffer. The config
                follows the conventions of the general
                replay_buffer_config. kwargs for subsequent calls of methods
                may also be included. Example:
                "replay_buffer_config": {"type": PrioritizedReplayBuffer,
                "capacity": 10, "storage_unit": "timesteps",
                prioritized_replay_alpha: 0.5, prioritized_replay_beta: 0.5,
                prioritized_replay_eps: 0.5}
            prioritized_replay_alpha: Alpha parameter for a prioritized
                replay buffer. Use 0.0 for no prioritization.
            prioritized_replay_beta: Beta parameter for a prioritized
                replay buffer.
            prioritized_replay_eps: Epsilon parameter for a prioritized
                replay buffer.
            **kwargs: Forward compatibility kwargs.
        """
        if not 0 <= replay_ratio <= 1:
            raise ValueError("Replay ratio must be within [0, 1]")

        MultiAgentPrioritizedReplayBuffer.__init__(
            self,
            capacity=capacity,
            storage_unit=storage_unit,
            num_shards=num_shards,
            replay_mode=replay_mode,
            replay_sequence_override=replay_sequence_override,
            replay_sequence_length=replay_sequence_length,
            replay_burn_in=replay_burn_in,
            replay_zero_init_states=replay_zero_init_states,
            underlying_buffer_config=underlying_buffer_config,
            prioritized_replay_alpha=prioritized_replay_alpha,
            prioritized_replay_beta=prioritized_replay_beta,
            prioritized_replay_eps=prioritized_replay_eps,
            **kwargs
        )

        self.replay_ratio = replay_ratio

        # Per-policy batches added since the last `sample()` call; these are
        # returned (mixed with replayed samples) on the next `sample()`.
        self.last_added_batches = collections.defaultdict(list)

    @DeveloperAPI
    @override(MultiAgentPrioritizedReplayBuffer)
    def add(self, batch: SampleBatchType, **kwargs) -> None:
        """Adds a batch to the appropriate policy's replay buffer.

        Turns the batch into a MultiAgentBatch of the DEFAULT_POLICY_ID if
        it is not a MultiAgentBatch. Subsequently, adds the individual policy
        batches to the storage.

        Args:
            batch: The batch to be added.
            **kwargs: Forward compatibility kwargs.
        """
        # Make a copy so the replay buffer doesn't pin plasma memory.
        batch = batch.copy()
        # Handle everything as if multi-agent.
        batch = batch.as_multi_agent()

        kwargs = merge_dicts_with_warning(self.underlying_buffer_call_args, kwargs)

        pids_and_batches = self._maybe_split_into_policy_batches(batch)

        # We need to split batches into timesteps, sequences or episodes
        # here already to properly keep track of self.last_added_batches
        # underlying buffers should not split up the batch any further
        with self.add_batch_timer:
            if self.storage_unit == StorageUnit.TIMESTEPS:
                for policy_id, sample_batch in pids_and_batches.items():
                    timeslices = sample_batch.timeslices(1)
                    for time_slice in timeslices:
                        self.replay_buffers[policy_id].add(time_slice, **kwargs)
                        self.last_added_batches[policy_id].append(time_slice)

            elif self.storage_unit == StorageUnit.SEQUENCES:
                for policy_id, sample_batch in pids_and_batches.items():
                    timeslices = timeslice_along_seq_lens_with_overlap(
                        sample_batch=sample_batch,
                        seq_lens=sample_batch.get(SampleBatch.SEQ_LENS)
                        if self.replay_sequence_override
                        else None,
                        zero_pad_max_seq_len=self.replay_sequence_length,
                        pre_overlap=self.replay_burn_in,
                        zero_init_states=self.replay_zero_init_states,
                    )
                    # Renamed from `slice` to avoid shadowing the builtin.
                    for time_slice in timeslices:
                        self.replay_buffers[policy_id].add(time_slice, **kwargs)
                        self.last_added_batches[policy_id].append(time_slice)

            elif self.storage_unit == StorageUnit.EPISODES:
                for policy_id, sample_batch in pids_and_batches.items():
                    for eps in sample_batch.split_by_episode():
                        # Only add full episodes to the buffer
                        if eps.get(SampleBatch.T)[0] == 0 and (
                            eps.get(SampleBatch.TERMINATEDS, [True])[-1]
                            or eps.get(SampleBatch.TRUNCATEDS, [False])[-1]
                        ):
                            self.replay_buffers[policy_id].add(eps, **kwargs)
                            self.last_added_batches[policy_id].append(eps)
                        else:
                            if log_once("only_full_episodes"):
                                logger.info(
                                    "This buffer uses episodes as a storage "
                                    "unit and thus allows only full episodes "
                                    "to be added to it. Some samples may be "
                                    "dropped."
                                )
            elif self.storage_unit == StorageUnit.FRAGMENTS:
                for policy_id, sample_batch in pids_and_batches.items():
                    self.replay_buffers[policy_id].add(sample_batch, **kwargs)
                    self.last_added_batches[policy_id].append(sample_batch)

        self._num_added += batch.count

    @DeveloperAPI
    @override(MultiAgentReplayBuffer)
    def sample(
        self, num_items: int, policy_id: PolicyID = DEFAULT_POLICY_ID, **kwargs
    ) -> Optional[SampleBatchType]:
        """Samples a batch of size `num_items` from a specified buffer.

        Concatenates old samples to new ones according to
        self.replay_ratio. If not enough new samples are available, mixes in
        less old samples to retain self.replay_ratio on average. Returns
        an empty batch if there are no items in the buffer.

        Args:
            num_items: Number of items to sample from this buffer.
            policy_id: ID of the policy that produced the experiences to be
                sampled.
            **kwargs: Forward compatibility kwargs.

        Returns:
            Concatenated MultiAgentBatch of items.
        """
        # Merge kwargs, overwriting standard call arguments
        kwargs = merge_dicts_with_warning(self.underlying_buffer_call_args, kwargs)

        def mix_batches(_policy_id):
            """Mixes old with new samples.

            Tries to mix according to self.replay_ratio on average.
            If not enough new samples are available, mixes in less old samples
            to retain self.replay_ratio on average.
            """

            def round_up_or_down(value, ratio):
                """Returns an integer averaging to value*ratio."""
                product = value * ratio
                ceil_prob = product % 1
                if random.uniform(0, 1) < ceil_prob:
                    return int(np.ceil(product))
                else:
                    return int(np.floor(product))

            max_num_new = round_up_or_down(num_items, 1 - self.replay_ratio)
            # if num_samples * self.replay_ratio is not round,
            # we need one more sample with a probability of
            # (num_items*self.replay_ratio) % 1

            _buffer = self.replay_buffers[_policy_id]
            output_batches = self.last_added_batches[_policy_id][:max_num_new]
            self.last_added_batches[_policy_id] = self.last_added_batches[_policy_id][
                max_num_new:
            ]

            # No replay desired
            if self.replay_ratio == 0.0:
                return concat_samples_into_ma_batch(output_batches)
            # Only replay desired
            elif self.replay_ratio == 1.0:
                return _buffer.sample(num_items, **kwargs)

            num_new = len(output_batches)

            if np.isclose(num_new, num_items * (1 - self.replay_ratio)):
                # The optimal case, we can mix in a round number of old
                # samples on average
                num_old = num_items - max_num_new
            else:
                # We never want to return more elements than num_items
                num_old = min(
                    num_items - max_num_new,
                    round_up_or_down(
                        num_new, self.replay_ratio / (1 - self.replay_ratio)
                    ),
                )

            output_batches.append(_buffer.sample(num_old, **kwargs))
            # Depending on the implementation of underlying buffers, samples
            # might be SampleBatches
            output_batches = [batch.as_multi_agent() for batch in output_batches]
            return concat_samples_into_ma_batch(output_batches)

        def check_buffer_is_ready(_policy_id):
            # Fix: check the replay buffer of `_policy_id` (the ID actually
            # being mixed), not the outer `policy_id` argument. The two differ
            # in lockstep mode, where `_policy_id` is `_ALL_POLICIES` while
            # `policy_id` still holds the (unused) default.
            if (
                (len(self.replay_buffers[_policy_id]) == 0)
                and self.replay_ratio > 0.0
            ) or (
                len(self.last_added_batches[_policy_id]) == 0
                and self.replay_ratio < 1.0
            ):
                return False
            return True

        with self.replay_timer:
            samples = []

            if self.replay_mode == ReplayMode.LOCKSTEP:
                assert (
                    policy_id is None
                ), "`policy_id` specifier not allowed in `lockstep` mode!"
                if check_buffer_is_ready(_ALL_POLICIES):
                    samples.append(mix_batches(_ALL_POLICIES).as_multi_agent())
            elif policy_id is not None:
                if check_buffer_is_ready(policy_id):
                    samples.append(mix_batches(policy_id).as_multi_agent())
            else:
                for policy_id, replay_buffer in self.replay_buffers.items():
                    if check_buffer_is_ready(policy_id):
                        samples.append(mix_batches(policy_id).as_multi_agent())

            return concat_samples_into_ma_batch(samples)

    @DeveloperAPI
    @override(MultiAgentPrioritizedReplayBuffer)
    def get_state(self) -> Dict[str, Any]:
        """Returns all local state.

        Returns:
            The serializable local state.
        """
        data = {
            "last_added_batches": self.last_added_batches,
        }
        parent = MultiAgentPrioritizedReplayBuffer.get_state(self)
        parent.update(data)
        return parent

    @DeveloperAPI
    @override(MultiAgentPrioritizedReplayBuffer)
    def set_state(self, state: Dict[str, Any]) -> None:
        """Restores all local state to the provided `state`.

        Args:
            state: The new state to set this buffer. Can be obtained by
                calling `self.get_state()`.
        """
        self.last_added_batches = state["last_added_batches"]
        # Fix: pass `self` explicitly. The previous unbound call
        # `MultiAgentPrioritizedReplayBuffer.set_state(state)` passed `state`
        # as `self` and raised a TypeError (missing the `state` argument).
        MultiAgentPrioritizedReplayBuffer.set_state(self, state)