diff --git "a/sactor_critic_algo.py" "b/sactor_critic_algo.py" deleted file mode 100644--- "a/sactor_critic_algo.py" +++ /dev/null @@ -1,2391 +0,0 @@ -from aug_mpc.agents.sactor_critic.sac import SACAgent -from aug_mpc.agents.dummies.dummy import DummyAgent - -from aug_mpc.utils.shared_data.algo_infos import SharedRLAlgorithmInfo, QfVal, QfTrgt -from aug_mpc.utils.shared_data.training_env import SubReturns, TotReturns -from aug_mpc.utils.nn.rnd import RNDFull - -import torch -import torch.optim as optim -import torch.nn as nn - -import random -import math -from typing import Dict - -import os -import shutil - -import time - -import wandb -import h5py -import numpy as np - -from EigenIPC.PyEigenIPC import LogType -from EigenIPC.PyEigenIPC import Journal -from EigenIPC.PyEigenIPC import VLevel - -from abc import ABC, abstractmethod - -class SActorCriticAlgoBase(ABC): - - # base class for actor-critic RL algorithms - - def __init__(self, - env, - debug = False, - remote_db = False, - seed: int = 1): - - self._env = env - self._seed = seed - - self._eval = False - self._det_eval = True - - self._full_env_db=False - - self._agent = None - - self._debug = debug - self._remote_db = remote_db - - self._writer = None - - self._run_name = None - self._drop_dir = None - self._dbinfo_drop_fname = None - self._model_path = None - - self._policy_update_db_data_dict = {} - self._custom_env_data_db_dict = {} - self._rnd_db_data_dict = {} - self._hyperparameters = {} - self._wandb_d={} - - # get params from env - self._get_params_from_env() - - self._torch_device = torch.device("cpu") # defaults to cpu - - self._setup_done = False - - self._verbose = False - - self._is_done = False - - self._shared_algo_data = None - - self._this_child_path = None - self._this_basepath = os.path.abspath(__file__) - - def __del__(self): - - self.done() - - def _get_params_from_env(self): - - self._env_name = self._env.name() - self._episodic_reward_metrics = self._env.ep_rewards_metrics() - self._use_gpu = self._env.using_gpu() - self._dtype = self._env.dtype() - self._env_opts=self._env.env_opts() - self._num_envs = self._env.n_envs() - self._obs_dim = self._env.obs_dim() - self._actions_dim = self._env.actions_dim() - self._episode_timeout_lb, self._episode_timeout_ub = self._env.episode_timeout_bounds() - self._task_rand_timeout_lb, self._task_rand_timeout_ub = self._env.task_rand_timeout_bounds() - self._env_n_action_reps = self._env.n_action_reps() - self._is_continuous_actions_bool=self._env.is_action_continuous() - self._is_continuous_actions=torch.where(self._is_continuous_actions_bool)[0] - self._is_discrete_actions_bool=self._env.is_action_discrete() - self._is_discrete_actions=torch.where(self._is_discrete_actions_bool)[0] - - # default to all debug envs - self._db_env_selector=torch.tensor(list(range(0, self._num_envs)), dtype=torch.int) - self._db_env_selector_bool=torch.full((self._num_envs, ), - dtype=torch.bool, device="cpu", - fill_value=True) - # default to no expl envs - self._expl_env_selector=None - self._expl_env_selector_bool=torch.full((self._num_envs, ), dtype=torch.bool, device="cpu", - fill_value=False) - self._pert_counter=0.0 - # demo envs - self._demo_stop_thresh=None # performance metrics above which demo envs are deactivated - # (can be overridden thorugh the provided options) - self._demo_env_selector=self._env.demo_env_idxs() - self._demo_env_selector_bool=self._env.demo_env_idxs(get_bool=True) - - def learn(self): - - if not self._setup_done: - self._should_have_called_setup() - - 
-    def learn(self):
-
-        if not self._setup_done:
-            self._should_have_called_setup()
-
-        self._start_time = time.perf_counter()
-
-        # experience collection
-        with torch.no_grad(): # don't need grad computation here
-            for i in range(self._collection_freq):
-                if not self._collect_transition():
-                    return False
-                self._vec_transition_counter+=1
-
-        self._collection_t = time.perf_counter()
-
-        if self._vec_transition_counter % self._bnorm_vecfreq == 0:
-            with torch.no_grad(): # don't need grad computation here
-                self._update_batch_norm(bsize=self._bnorm_bsize)
-
-        # policy update
-        self._policy_update_t_start = time.perf_counter()
-        for i in range(self._update_freq):
-            self._update_policy()
-            self._update_counter+=1
-
-        self._policy_update_t = time.perf_counter()
-
-        with torch.no_grad():
-            if self._validate and (self._vec_transition_counter % self._validation_db_vecstep_freq == 0):
-                # validation
-                self._update_validation_losses()
-            self._validation_t = time.perf_counter()
-            self._post_step()
-
-        if self._use_period_resets:
-            # periodic policy resets
-            if not self._adaptive_resets:
-                self._periodic_resets_on=(self._vec_transition_counter >= self._reset_vecstep_start) and \
-                    (self._vec_transition_counter < self._reset_vecstep_end)
-
-                if self._periodic_resets_on and \
-                    (self._vec_transition_counter-self._reset_vecstep_start) % self._periodic_resets_vecfreq == 0:
-                    self._reset_agent()
-            else: # trigger reset based on overfit metric
-                if self._overfit_idx > self._overfit_idx_thresh:
-                    self._reset_agent()
-
-        return not self.is_done()
-
-    def eval(self):
-
-        if not self._setup_done:
-            self._should_have_called_setup()
-
-        self._start_time = time.perf_counter()
-
-        if not self._collect_eval_transition():
-            return False
-        self._vec_transition_counter+=1
-
-        self._collection_t = time.perf_counter()
-
-        self._post_step()
-
-        return not self.is_done()
-
-    @abstractmethod
-    def _collect_transition(self)->bool:
-        pass
-
-    @abstractmethod
-    def _collect_eval_transition(self)->bool:
-        pass
-
-    @abstractmethod
-    def _update_policy(self):
-        pass
-
-    @abstractmethod
-    def _update_validation_losses(self):
-        pass
-
-    def _update_overfit_idx(self, loss, val_loss):
-        overfit_now=(val_loss-loss)/loss
-        self._overfit_idx=self._overfit_idx_alpha*overfit_now+\
-            (1-self._overfit_idx_alpha)*self._overfit_idx
-
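`_update_overfit_idx` keeps an exponential moving average of the relative gap between validation and training loss; when the smoothed index exceeds `_overfit_idx_thresh`, `learn()` triggers an agent reset. A minimal sketch of the same filter, with constants mirroring the defaults set later in `_init_params`:

```python
alpha = 0.03    # EMA smoothing factor (_overfit_idx_alpha)
thresh = 2.0    # reset trigger (_overfit_idx_thresh)
overfit_idx = 0.0

def update_overfit_idx(loss: float, val_loss: float) -> float:
    global overfit_idx
    overfit_now = (val_loss - loss) / loss  # relative generalization gap
    overfit_idx = alpha * overfit_now + (1.0 - alpha) * overfit_idx
    return overfit_idx

# a validation loss 4x the training loss drives the index toward 3.0,
# eventually crossing the reset threshold of 2.0
for _ in range(200):
    update_overfit_idx(loss=1.0, val_loss=4.0)
print(overfit_idx > thresh)  # True once the EMA has converged enough
```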
Exiting.", - LogType.EXCEP, - throw_when_excep = True) - - self._load_qf=False - if self._eval: - if "load_qf" in custom_args: - self._load_qf=custom_args["load_qf"] - if self._resume: - self._load_qf=True # must load qf when resuming - self._eval=False - try: - self._det_eval=custom_args["det_eval"] - except: - pass - - self._override_agent_actions=False - if "override_agent_actions" in custom_args: - self._override_agent_actions=custom_args["override_agent_actions"] - - if self._override_agent_actions: # force evaluation mode - Journal.log(self.__class__.__name__, - "setup", - "will force evaluation mode since override_agent_actions was set to true", - LogType.INFO, - throw_when_excep = True) - self._eval=True - self._validate=False - self._load_qf=False - self._det_eval=False - self._resume=False - - self._run_name = run_name - from datetime import datetime - self._time_id = datetime.now().strftime('d%Y_%m_%d_h%H_m%M_s%S') - self._unique_id = self._time_id + "-" + self._run_name - - self._hyperparameters["unique_run_id"]=self._unique_id - self._hyperparameters.update(custom_args) - - self._torch_device = torch.device("cuda" if torch.cuda.is_available() and self._use_gpu else "cpu") - - try: - layer_width_actor=self._hyperparameters["actor_lwidth"] - layer_width_critic=self._hyperparameters["critic_lwidth"] - n_hidden_layers_actor=self._hyperparameters["actor_n_hlayers"] - n_hidden_layers_critic=self._hyperparameters["critic_n_hlayers"] - except: - layer_width_actor=256 - layer_width_critic=512 - n_hidden_layers_actor=2 - n_hidden_layers_critic=4 - pass - - use_torch_compile=False - add_weight_norm=False - add_layer_norm=False - add_batch_norm=False - compression_ratio=-1.0 - if "use_torch_compile" in self._hyperparameters and \ - self._hyperparameters["use_torch_compile"]: - use_torch_compile=True - if "add_weight_norm" in self._hyperparameters and \ - self._hyperparameters["add_weight_norm"]: - add_weight_norm=True - if "add_layer_norm" in self._hyperparameters and \ - self._hyperparameters["add_layer_norm"]: - add_layer_norm=True - if "add_batch_norm" in self._hyperparameters and \ - self._hyperparameters["add_batch_norm"]: - add_batch_norm=True - if "compression_ratio" in self._hyperparameters: - compression_ratio=self._hyperparameters["compression_ratio"] - - act_rescale_critic=False - if "act_rescale_critic" in self._hyperparameters: - act_rescale_critic=self._hyperparameters["act_rescale_critic"] - if not self._override_agent_actions: - self._agent = SACAgent(obs_dim=self._env.obs_dim(), - obs_ub=self._env.get_obs_ub().flatten().tolist(), - obs_lb=self._env.get_obs_lb().flatten().tolist(), - actions_dim=self._env.actions_dim(), - actions_ub=None, # agent will assume actions are properly normalized in [-1, 1] by the env - actions_lb=None, - rescale_obs=rescale_obs, - norm_obs=norm_obs, - use_action_rescale_for_critic=act_rescale_critic, - compression_ratio=compression_ratio, - device=self._torch_device, - dtype=self._dtype, - is_eval=self._eval, - load_qf=self._load_qf, - debug=self._debug, - layer_width_actor=layer_width_actor, - layer_width_critic=layer_width_critic, - n_hidden_layers_actor=n_hidden_layers_actor, - n_hidden_layers_critic=n_hidden_layers_critic, - torch_compile=use_torch_compile, - add_weight_norm=add_weight_norm, - add_layer_norm=add_layer_norm, - add_batch_norm=add_batch_norm) - else: # we use a fake agent - self._agent = DummyAgent(obs_dim=self._env.obs_dim(), - actions_dim=self._env.actions_dim(), - actions_ub=None, - actions_lb=None, - 
-        # log actual widths and layer counts in case they were overridden inside agent init
-        self._hyperparameters["actor_lwidth_actual"]=self._agent.layer_width_actor()
-        self._hyperparameters["actor_n_hlayers_actual"]=self._agent.n_hidden_layers_actor()
-        self._hyperparameters["critic_lwidth_actual"]=self._agent.layer_width_critic()
-        self._hyperparameters["critic_n_hlayers_actual"]=self._agent.n_hidden_layers_critic()
-
-        # load model if necessary
-        if self._eval and (not self._override_agent_actions): # load pretrained model
-            if model_path is None:
-                msg = f"No model path provided in eval mode! Was this intentional? \
-                    No jnt remapping will be available and a randomly initialized agent will be used."
-                Journal.log(self.__class__.__name__,
-                    "setup",
-                    msg,
-                    LogType.WARN,
-                    throw_when_excep = True)
-            if n_eval_timesteps is None:
-                Journal.log(self.__class__.__name__,
-                    "setup",
-                    f"When eval is True, n_eval_timesteps should be provided!",
-                    LogType.EXCEP,
-                    throw_when_excep = True)
-            # everything is ok
-            self._model_path = model_path
-            if self._model_path is not None:
-                self._load_model(self._model_path)
-
-            # overwrite init params
-            self._init_params(tot_tsteps=n_eval_timesteps,
-                custom_args=custom_args)
-        else:
-            if self._resume:
-                if model_path is None:
-                    msg = f"No model path provided in resume mode! Please provide a valid checkpoint path."
-                    Journal.log(self.__class__.__name__,
-                        "setup",
-                        msg,
-                        LogType.EXCEP,
-                        throw_when_excep = True)
-                self._model_path = model_path
-                if self._model_path is not None:
-                    self._load_model(self._model_path) # load model from checkpoint (including q functions and running normalizers)
-            self._init_params(tot_tsteps=tot_tsteps,
-                custom_args=custom_args)
-
-        # adding additional db info
-        self._hyperparameters["obs_names"]=self._env.obs_names()
-        self._hyperparameters["action_names"]=self._env.action_names()
-        self._hyperparameters["sub_reward_names"]=self._env.sub_rew_names()
-        self._hyperparameters["sub_trunc_names"]=self._env.sub_trunc_names()
-        self._hyperparameters["sub_term_names"]=self._env.sub_term_names()
-
-        self._allow_expl_during_eval=False
-        if "allow_expl_during_eval" in self._hyperparameters:
-            self._allow_expl_during_eval=self._hyperparameters["allow_expl_during_eval"]
-
-        # reset environment
-        self._env.reset()
-        if self._eval:
-            self._env.switch_random_reset(on=False)
-
-        if self._debug and (not self._override_agent_actions):
-            with torch.no_grad():
-                init_obs = self._env.get_obs(clone=True)
-                _, init_log_pi, _ = self._agent.get_action(init_obs)
-                init_policy_entropy = (-init_log_pi[0]).mean().item()
-                init_policy_entropy_per_action = init_policy_entropy / float(self._actions_dim)
-                Journal.log(self.__class__.__name__,
-                    "setup",
-                    f"Initial policy entropy per action: {init_policy_entropy_per_action:.4f}",
-                    LogType.INFO,
-                    throw_when_excep = True)
-
-        # create dump directory + copy important files for debug
-        self._init_drop_dir(drop_dir_name)
-        self._hyperparameters["drop_dir"]=self._drop_dir
-
-        # add env options to hyperparameters
-        self._hyperparameters.update(self._env_opts)
-
-        if not self._eval:
-            self._init_agent_optimizers()
-
-            self._init_replay_buffers() # only needed when training
-            if self._validate:
-                self._init_validation_buffers()
-
-            if self._autotune:
-                self._init_alpha_autotuning()
-
-        if self._use_rnd:
-            self._rnd_net = RNDFull(input_dim=self._rnd_indim, output_dim=self._rnd_outdim,
-                layer_width=self._rnd_lwidth, n_hidden_layers=self._rnd_hlayers,
-                device=self._torch_device,
-                dtype=self._dtype,
-                normalize=norm_obs # normalize if also used for SAC agent
-                )
-            self._rnd_optimizer = torch.optim.Adam(self._rnd_net.rnd_predictor_net.parameters(),
-                lr=self._rnd_lr)
-
-            self._rnd_input = torch.full(size=(self._batch_size, self._rnd_net.input_dim()),
-                fill_value=0.0,
-                dtype=self._dtype,
-                device=self._torch_device,
-                requires_grad=False)
-            self._rnd_bnorm_input = torch.full(size=(self._bnorm_bsize, self._rnd_net.input_dim()),
-                fill_value=0.0,
-                dtype=self._dtype,
-                device=self._torch_device,
-                requires_grad=False)
-
-            self._proc_exp_bonus_all = torch.full(size=(self._batch_size, 1),
-                fill_value=0.0,
-                dtype=self._dtype,
-                device=self._torch_device,
-                requires_grad=False)
-            self._raw_exp_bonus_all = torch.full(size=(self._batch_size, 1),
-                fill_value=0.0,
-                dtype=self._dtype,
-                device=self._torch_device,
-                requires_grad=False)
-            # if self._autotune_rnd_scale:
-            #     self._reward_normalizer=RunningNormalizer((1,), epsilon=1e-8,
-            #         device=self._torch_device, dtype=self._dtype,
-            #         freeze_stats=False,
-            #         debug=self._debug)
-
-        self._init_dbdata()
-
-        if (self._debug):
-            if self._remote_db:
-                job_type = "evaluation" if self._eval else "training"
-                project="IBRIDO-ablations"
-                wandb.init(
-                    project=project,
-                    group=self._run_name,
-                    name=self._unique_id,
-                    id=self._unique_id,
-                    job_type=job_type,
-                    # tags=None,
-                    notes=comment,
-                    resume="never", # do not allow runs with the same unique id
-                    mode="online", # "online", "offline" or "disabled"
-                    entity=None,
-                    sync_tensorboard=True,
-                    config=self._hyperparameters,
-                    monitor_gym=True,
-                    save_code=True,
-                    dir=self._drop_dir
-                )
-                wandb.watch((self._agent), log="all", log_freq=1000, log_graph=False)
-
-        if "demo_stop_thresh" in self._hyperparameters:
-            self._demo_stop_thresh=self._hyperparameters["demo_stop_thresh"]
-
-        actions = self._env.get_actions()
-        self._random_uniform = torch.full_like(actions, fill_value=0.0) # used for sampling random actions
-        # (preallocated for efficiency)
-        self._random_normal = torch.full_like(self._random_uniform, fill_value=0.0)
-
-        self._actions_override=None
-        if self._override_agent_actions:
-            from aug_mpc.utils.shared_data.training_env import Actions
-            self._actions_override = Actions(namespace=ns+"_override",
-                n_envs=self._num_envs,
-                action_dim=actions.shape[1],
-                action_names=self._env.action_names(),
-                env_names=None,
-                is_server=True,
-                verbose=self._verbose,
-                vlevel=VLevel.V2,
-                safe=True,
-                force_reconnection=True,
-                with_gpu_mirror=self._use_gpu,
-                fill_value=0.0)
-            self._actions_override.run()
-
-        self._start_time_tot = time.perf_counter()
-
-        self._start_time = time.perf_counter()
-
-        self._replay_bf_full = False
-        self._validation_bf_full = False
-        self._bpos=0
-        self._bpos_val=0
-
-        self._is_done = False
-        self._setup_done = True
-
-    def is_done(self):
-
-        return self._is_done
-
-    def model_path(self):
-
-        return self._model_path
-
-    def _init_params(self,
-            tot_tsteps: int,
-            custom_args: Dict = {}):
-
-        self._collection_freq=1
-        self._update_freq=4
-
-        self._replay_buffer_size_vec=10*self._task_rand_timeout_ub # cover at least a number of eps
-        self._replay_buffer_size = self._replay_buffer_size_vec*self._num_envs
-        if self._replay_buffer_size_vec < 0: # in case env did not properly define _task_rand_timeout_ub
-            self._replay_buffer_size = int(1e6)
-            self._replay_buffer_size_vec = self._replay_buffer_size//self._num_envs
-            self._replay_buffer_size=self._replay_buffer_size_vec*self._num_envs
-
-        self._batch_size = 16394
-
-        new_transitions_per_batch=self._collection_freq*self._num_envs/self._replay_buffer_size # assumes uniform sampling
-        self._utd_ratio=self._update_freq/(new_transitions_per_batch*self._batch_size)
-
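One way to read the update-to-data (UTD) ratio computed just above: assuming uniform sampling, each sampled batch contains on average `collection_freq * n_envs / buffer_size` fresh transitions per slot, and the ratio counts gradient updates per fresh transition consumed. A worked example with illustrative numbers:

```python
n_envs = 1024
collection_freq = 1          # vec steps collected per learn() call
update_freq = 4              # gradient updates per learn() call
buffer_size = 1_000_000
batch_size = 16394

# fraction of the buffer that is new data after one collection phase
new_fraction = collection_freq * n_envs / buffer_size
# expected number of fresh transitions in one sampled batch
fresh_per_batch = new_fraction * batch_size   # ~16.8 here
# gradient updates per fresh transition consumed
utd_ratio = update_freq / fresh_per_batch
print(f"{utd_ratio:.3f} updates per new transition")  # ~0.238
```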
-        self._lr_policy = 1e-3
-        self._lr_q = 5e-4
-
-        self._discount_factor = 0.99
-        if "discount_factor" in custom_args:
-            self._discount_factor=custom_args["discount_factor"]
-
-        self._smoothing_coeff = 0.01
-
-        self._policy_freq = 2
-        self._trgt_net_freq = 1
-        self._rnd_freq = 1
-
-        # exploration
-
-        # entropy regularization (separate "discrete" and "continuous" actions)
-        self._entropy_metric_high = 0.5
-        self._entropy_metric_low = 0.0
-
-        # self._entropy_disc_start = -0.05
-        # self._entropy_disc_end = -0.5
-
-        # self._entropy_cont_start = -0.05
-        # self._entropy_cont_end = -2.0
-
-        self._entropy_disc_start = -0.2
-        self._entropy_disc_end = -0.2
-
-        self._entropy_cont_start = -0.8
-        self._entropy_cont_end = -0.8
-
-        # enable/disable entropy annealing
-        self._anneal_entropy = False
-
-        self._trgt_avrg_entropy_per_action_disc = self._entropy_disc_start
-        self._trgt_avrg_entropy_per_action_cont = self._entropy_cont_start
-
-        self._disc_idxs = self._is_discrete_actions.clone().to(torch.long)
-        self._cont_idxs = self._is_continuous_actions.clone().to(torch.long)
-
-        self._target_entropy_disc = float(self._disc_idxs.numel()) * float(self._trgt_avrg_entropy_per_action_disc)
-        self._target_entropy_cont = float(self._cont_idxs.numel()) * float(self._trgt_avrg_entropy_per_action_cont)
-        self._target_entropy = self._target_entropy_disc + self._target_entropy_cont
-        self._trgt_avrg_entropy_per_action = self._target_entropy / float(max(self._actions_dim, 1))
-        self._hyperparameters["anneal_entropy"] = self._anneal_entropy
-
-        self._autotune = True
-        self._alpha_disc = 0.2 # initial values
-        self._alpha_cont = 0.2
-        self._alpha = 0.5*(self._alpha_disc + self._alpha_cont)
-        self._log_alpha_disc = math.log(self._alpha_disc)
-        self._log_alpha_cont = math.log(self._alpha_cont)
-        self._a_optimizer_disc = None
-        self._a_optimizer_cont = None
-
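Separate temperatures for the discrete and continuous action partitions are initialized above and later optimized against their own entropy targets. The per-partition update is the standard SAC temperature loss; a minimal sketch of one such step (names are illustrative, and the actual update lives in the subclass's `_update_policy`):

```python
import math
import torch
import torch.optim as optim

target_entropy_cont = -0.8 * 3   # per-action target * n continuous actions
log_alpha_cont = torch.tensor([math.log(0.2)], requires_grad=True)
a_optimizer = optim.Adam([log_alpha_cont], lr=5e-4)

# log-probs of the continuous action partition under the current policy,
# e.g. from a sampled minibatch (placeholder values here)
log_pi_cont = torch.randn(256, 1) - 1.0

# standard SAC temperature loss: pushes policy entropy toward the target
alpha_loss = (-log_alpha_cont.exp() * (log_pi_cont + target_entropy_cont).detach()).mean()
a_optimizer.zero_grad()
alpha_loss.backward()
a_optimizer.step()
alpha_cont = log_alpha_cont.exp().item()
```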
-        # random expl envs
-        self._expl_envs_perc=0.0 # [0, 1]
-        if "expl_envs_perc" in custom_args:
-            self._expl_envs_perc=custom_args["expl_envs_perc"]
-        self._n_expl_envs = int(self._num_envs*self._expl_envs_perc) # n of random envs on which noisy actions will be applied
-        self._noise_freq_vec = 100 # substeps
-        self._noise_duration_vec = 5 # should be less than _noise_freq_vec
-        # correct with env substepping
-        self._noise_freq_vec=self._noise_freq_vec//self._env_n_action_reps
-        self._noise_duration_vec=self._noise_duration_vec//self._env_n_action_reps
-
-        self._continuous_act_expl_noise_std=0.3 # wrt actions scale
-        self._discrete_act_expl_noise_std=1.2 # setting it a bit > 1 helps in ensuring the discrete actions range is explored
-
-        # rnd
-        self._use_rnd=True
-        self._rnd_net=None
-        self._rnd_optimizer = None
-        self._rnd_lr = 1e-3
-        if "use_rnd" in custom_args and (not self._eval):
-            self._use_rnd=custom_args["use_rnd"]
-        self._rnd_weight=1.0
-        self._alpha_rnd=0.0
-        self._novelty_scaler=None
-        if self._use_rnd:
-            from adarl.utils.NoveltyScaler import NoveltyScaler
-
-            self._novelty_scaler=NoveltyScaler(th_device=self._torch_device,
-                bonus_weight=self._rnd_weight,
-                avg_alpha=self._alpha_rnd)
-
-        self._rnd_lwidth=512
-        self._rnd_hlayers=3
-        self._rnd_outdim=16
-        self._rnd_indim=self._obs_dim+self._actions_dim
-
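RND drives the exploration bonus: a fixed, randomly initialized target network is distilled into a trained predictor, and the prediction error on an (obs, action) pair serves as a novelty signal. `RNDFull` encapsulates this; below is a minimal self-contained sketch of the idea, not the actual `RNDFull` internals:

```python
import torch
import torch.nn as nn

in_dim, out_dim = 32, 16   # illustrative; here in_dim = obs_dim + actions_dim

def mlp(width=512, n_hidden=3):
    layers = [nn.Linear(in_dim, width), nn.ReLU()]
    for _ in range(n_hidden - 1):
        layers += [nn.Linear(width, width), nn.ReLU()]
    return nn.Sequential(*layers, nn.Linear(width, out_dim))

target = mlp().requires_grad_(False)   # fixed random network, never trained
predictor = mlp()                      # trained to imitate the target
opt = torch.optim.Adam(predictor.parameters(), lr=1e-3)

x = torch.randn(256, in_dim)           # batch of (obs, action) features
err = (predictor(x) - target(x)).pow(2).mean(dim=1, keepdim=True)
bonus = err.detach()                   # novelty bonus: high on unfamiliar inputs
loss = err.mean()                      # distillation loss for the predictor
opt.zero_grad(); loss.backward(); opt.step()
```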
-        # batch normalization
-        self._bnorm_bsize = 4096
-        self._bnorm_vecfreq_nom = 5 # wrt vec steps
-        # make sure _bnorm_vecfreq is a multiple of _collection_freq
-        self._bnorm_vecfreq = (self._bnorm_vecfreq_nom//self._collection_freq)*self._collection_freq
-        if self._bnorm_vecfreq == 0:
-            self._bnorm_vecfreq=self._collection_freq
-        self._reward_normalizer=None
-
-        self._total_timesteps = int(tot_tsteps)
-        # self._total_timesteps = self._total_timesteps//self._env_n_action_reps # correct with n of action reps
-        self._total_timesteps_vec = self._total_timesteps // self._num_envs
-        self._total_steps = self._total_timesteps_vec//self._collection_freq
-        self._total_timesteps_vec = self._total_steps*self._collection_freq # correct to be a multiple of self._total_steps
-        self._total_timesteps = self._total_timesteps_vec*self._num_envs # actual n transitions
-
-        # self._warmstart_timesteps = int(5e3)
-        warmstart_length_single_env=min(self._episode_timeout_lb, self._episode_timeout_ub,
-            self._task_rand_timeout_lb, self._task_rand_timeout_ub)
-        self._warmstart_timesteps=warmstart_length_single_env*self._num_envs
-        if self._warmstart_timesteps < self._batch_size: # ensure we collect sufficient experience before starting training
-            self._warmstart_timesteps=4*self._batch_size
-        self._warmstart_vectimesteps = self._warmstart_timesteps//self._num_envs
-        # ensuring multiple of collection_freq
-        self._warmstart_timesteps = self._num_envs*self._warmstart_vectimesteps # actual
-
-        # periodic net resets (to tackle the primacy bias)
-        self._use_period_resets=False
-        if "use_period_resets" in custom_args:
-            self._use_period_resets=custom_args["use_period_resets"]
-        self._adaptive_resets=True # trigger reset based on overfit metric
-        self._just_one_reset=False
-        self._periodic_resets_freq=int(4e6)
-        self._periodic_resets_start=int(1.5e6)
-        self._periodic_resets_end=int(0.8*self._total_timesteps)
-
-        self._periodic_resets_vecfreq=self._periodic_resets_freq//self._num_envs
-        self._periodic_resets_vecfreq = (self._periodic_resets_vecfreq//self._collection_freq)*self._collection_freq
-        self._reset_vecstep_start=self._periodic_resets_start//self._num_envs
-        self._reset_vecstep_end=self._periodic_resets_end//self._num_envs
-
-        if self._just_one_reset:
-            # we set the end as the first reset + a fraction of the reset frequency (this way only one reset will happen)
-            self._reset_vecstep_end=int(self._reset_vecstep_start+0.8*self._periodic_resets_vecfreq)
-
-        self._periodic_resets_on=False
-
-        # debug
-        self._m_checkpoint_freq_nom = 1e6 # n total timesteps after which a checkpoint model is dumped
-        self._m_checkpoint_freq= self._m_checkpoint_freq_nom//self._num_envs
-
-        # expl envs
-        if self._n_expl_envs>0 and ((self._num_envs-self._n_expl_envs)>0): # log data only from envs which are not altered (e.g. by exploration noise)
-            # computing expl env selector
-            self._expl_env_selector = torch.randperm(self._num_envs, device="cpu")[:self._n_expl_envs]
-            self._expl_env_selector_bool[self._expl_env_selector]=True
-
-            # demo envs
-            if self._demo_env_selector_bool is None:
-                self._db_env_selector_bool[:]=~self._expl_env_selector_bool
-            else: # we log db data separately for envs which are neither demo nor random-exploration envs
-                self._demo_env_selector_bool=self._demo_env_selector_bool.cpu()
-                self._demo_env_selector=self._demo_env_selector.cpu()
-                self._db_env_selector_bool[:]=torch.logical_and(~self._expl_env_selector_bool, ~self._demo_env_selector_bool)
-
-        self._n_expl_envs = self._expl_env_selector_bool.count_nonzero()
-        self._num_db_envs = self._db_env_selector_bool.count_nonzero()
-
-        if not self._num_db_envs>0:
-            Journal.log(self.__class__.__name__,
-                "_init_params",
-                "No independent db env can be computed (check your demo and expl settings)! Will use all envs.",
-                LogType.EXCEP,
-                throw_when_excep = False)
-            self._num_db_envs=self._num_envs
-            self._db_env_selector_bool[:]=True
-        self._db_env_selector=torch.nonzero(self._db_env_selector_bool).flatten()
-
-        self._transition_noise_freq=float(self._noise_duration_vec)/float(self._noise_freq_vec)
-        self._env_noise_freq=float(self._n_expl_envs)/float(self._num_envs)
-        self._noise_buff_freq=self._transition_noise_freq*self._env_noise_freq
-
-        self._db_vecstep_frequency = 32 # log db data every n (vectorized) SUB timesteps
-        self._db_vecstep_frequency=round(self._db_vecstep_frequency/self._env_n_action_reps) # correcting with action reps
-        # correct db vecstep frequency to ensure it's a multiple of self._collection_freq
-        self._db_vecstep_frequency=(self._db_vecstep_frequency//self._collection_freq)*self._collection_freq
-        if self._db_vecstep_frequency == 0:
-            self._db_vecstep_frequency=self._collection_freq
-
-        self._env_db_checkpoints_vecfreq=150*self._db_vecstep_frequency # detailed db data from envs
-
-        self._validate=True
-        self._validation_collection_vecfreq=50 # add vec transitions to the validation buffer at this vec frequency
-        self._validation_ratio=1.0/self._validation_collection_vecfreq # in [0, 1]; e.g. 0.1 -> validation buffer is 10% of the training buffer
-        self._validation_buffer_size_vec = int(self._replay_buffer_size*self._validation_ratio)//self._num_envs
-        self._validation_buffer_size = self._validation_buffer_size_vec*self._num_envs
-        self._validation_batch_size = int(self._batch_size*self._validation_ratio)
-        self._validation_db_vecstep_freq=self._db_vecstep_frequency
-        if self._eval: # no need for validation transitions during evaluation
-            self._validate=False
-        self._overfit_idx=0.0
-        self._overfit_idx_alpha=0.03 # exponential MA
-        self._overfit_idx_thresh=2.0
-
-        self._n_policy_updates_to_be_done=(self._total_steps-self._warmstart_vectimesteps)*self._update_freq # TD3-style delayed update
-        self._n_qf_updates_to_be_done=(self._total_steps-self._warmstart_vectimesteps)*self._update_freq # qf updated at each vec timestep
-        self._n_tqf_updates_to_be_done=(self._total_steps-self._warmstart_vectimesteps)*self._update_freq//self._trgt_net_freq
-
-        self._exp_to_policy_grad_ratio=float(self._total_timesteps-self._warmstart_timesteps)/float(self._n_policy_updates_to_be_done)
-        self._exp_to_qf_grad_ratio=float(self._total_timesteps-self._warmstart_timesteps)/float(self._n_qf_updates_to_be_done)
-        self._exp_to_qft_grad_ratio=float(self._total_timesteps-self._warmstart_timesteps)/float(self._n_tqf_updates_to_be_done)
-
-        self._db_data_size = round(self._total_timesteps_vec/self._db_vecstep_frequency)+self._db_vecstep_frequency
-
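The periodic and overfit-triggered resets configured above address the primacy bias: networks trained too long on early data lose plasticity, and re-initializing them while keeping the replay buffer recovers it. One common way to reset a module in place, shown here as a hedged sketch (the actual `_reset_agent` is defined in the subclass and is not shown in this file):

```python
import torch.nn as nn

def reset_all_parameters(net: nn.Module) -> None:
    # re-initialize every submodule that defines reset_parameters()
    # (Linear, LayerNorm, ...); replay data and counters are untouched
    for m in net.modules():
        if hasattr(m, "reset_parameters"):
            m.reset_parameters()

critic = nn.Sequential(nn.Linear(8, 64), nn.ReLU(), nn.Linear(64, 1))
reset_all_parameters(critic)   # weights are fresh, the collected experience is not
```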
-        # write them to hyperparam dictionary for debugging
-        self._hyperparameters["n_envs"] = self._num_envs
-        self._hyperparameters["obs_dim"] = self._obs_dim
-        self._hyperparameters["actions_dim"] = self._actions_dim
-
-        self._hyperparameters["seed"] = self._seed
-        self._hyperparameters["using_gpu"] = self._use_gpu
-        self._hyperparameters["total_timesteps_vec"] = self._total_timesteps_vec
-
-        self._hyperparameters["collection_freq"]=self._collection_freq
-        self._hyperparameters["update_freq"]=self._update_freq
-        self._hyperparameters["total_steps"]=self._total_steps
-
-        self._hyperparameters["utd_ratio"] = self._utd_ratio
-
-        self._hyperparameters["n_policy_updates_when_done"] = self._n_policy_updates_to_be_done
-        self._hyperparameters["n_qf_updates_when_done"] = self._n_qf_updates_to_be_done
-        self._hyperparameters["n_tqf_updates_when_done"] = self._n_tqf_updates_to_be_done
-        self._hyperparameters["experience_to_policy_grad_steps_ratio"] = self._exp_to_policy_grad_ratio
-        self._hyperparameters["experience_to_quality_fun_grad_steps_ratio"] = self._exp_to_qf_grad_ratio
-        self._hyperparameters["experience_to_trgt_quality_fun_grad_steps_ratio"] = self._exp_to_qft_grad_ratio
-
-        self._hyperparameters["episodes timeout lb"] = self._episode_timeout_lb
-        self._hyperparameters["episodes timeout ub"] = self._episode_timeout_ub
-        self._hyperparameters["task rand timeout lb"] = self._task_rand_timeout_lb
-        self._hyperparameters["task rand timeout ub"] = self._task_rand_timeout_ub
-
-        self._hyperparameters["warmstart_timesteps"] = self._warmstart_timesteps
-        self._hyperparameters["warmstart_vectimesteps"] = self._warmstart_vectimesteps
-        self._hyperparameters["replay_buffer_size"] = self._replay_buffer_size
-        self._hyperparameters["batch_size"] = self._batch_size
-        self._hyperparameters["total_timesteps"] = self._total_timesteps
-        self._hyperparameters["lr_policy"] = self._lr_policy
-        self._hyperparameters["lr_q"] = self._lr_q
-        self._hyperparameters["discount_factor"] = self._discount_factor
-        self._hyperparameters["smoothing_coeff"] = self._smoothing_coeff
-        self._hyperparameters["policy_freq"] = self._policy_freq
-        self._hyperparameters["trgt_net_freq"] = self._trgt_net_freq
-        self._hyperparameters["autotune"] = self._autotune
-        self._hyperparameters["target_entropy"] = self._target_entropy
-        self._hyperparameters["target_entropy_disc"] = self._target_entropy_disc
-        self._hyperparameters["target_entropy_cont"] = self._target_entropy_cont
-        self._hyperparameters["disc_entropy_idxs"] = self._disc_idxs.tolist()
-        self._hyperparameters["cont_entropy_idxs"] = self._cont_idxs.tolist()
-        self._hyperparameters["log_alpha_disc"] = None if self._log_alpha_disc is None else self._log_alpha_disc
-        self._hyperparameters["log_alpha_cont"] = None if self._log_alpha_cont is None else self._log_alpha_cont
-        self._hyperparameters["alpha"] = self._alpha
-        self._hyperparameters["alpha_disc"] = self._alpha_disc
-        self._hyperparameters["alpha_cont"] = self._alpha_cont
-        self._hyperparameters["m_checkpoint_freq"] = self._m_checkpoint_freq
-        self._hyperparameters["db_vecstep_frequency"] = self._db_vecstep_frequency
-
-        self._hyperparameters["use_period_resets"]= self._use_period_resets
-        self._hyperparameters["just_one_reset"]= self._just_one_reset
-        self._hyperparameters["period_resets_vecfreq"]= self._periodic_resets_vecfreq
self._hyperparameters["period_resets_vecstart"]= self._reset_vecstep_start - self._hyperparameters["period_resets_vecend"]= self._reset_vecstep_end - self._hyperparameters["period_resets_freq"]= self._periodic_resets_freq - self._hyperparameters["period_resets_start"]= self._periodic_resets_start - self._hyperparameters["period_resets_end"]= self._periodic_resets_end - - self._hyperparameters["use_rnd"] = self._use_rnd - self._hyperparameters["rnd_lwidth"] = self._rnd_lwidth - self._hyperparameters["rnd_hlayers"] = self._rnd_hlayers - self._hyperparameters["rnd_outdim"] = self._rnd_outdim - self._hyperparameters["rnd_indim"] = self._rnd_indim - - self._hyperparameters["n_db_envs"] = self._num_db_envs - self._hyperparameters["n_expl_envs"] = self._n_expl_envs - self._hyperparameters["noise_freq"] = self._noise_freq_vec - self._hyperparameters["noise_duration_vec"] = self._noise_duration_vec - self._hyperparameters["noise_buff_freq"] = self._noise_buff_freq - self._hyperparameters["continuous_act_expl_noise_std"] = self._continuous_act_expl_noise_std - self._hyperparameters["discrete_act_expl_noise_std"] = self._discrete_act_expl_noise_std - - self._hyperparameters["n_demo_envs"] = self._env.n_demo_envs() - - self._hyperparameters["bnorm_bsize"] = self._bnorm_bsize - self._hyperparameters["bnorm_vecfreq"] = self._bnorm_vecfreq - - self._hyperparameters["validate"] = self._validate - self._hyperparameters["validation_ratio"] = self._validation_ratio - self._hyperparameters["validation_buffer_size_vec"] = self._validation_buffer_size_vec - self._hyperparameters["validation_buffer_size"] = self._validation_buffer_size - self._hyperparameters["validation_batch_size"] = self._validation_batch_size - self._hyperparameters["validation_collection_vecfreq"] = self._validation_collection_vecfreq - - # small debug log - info = f"\nUsing \n" + \ - f"total (vectorized) timesteps to be simulated {self._total_timesteps_vec}\n" + \ - f"total timesteps to be simulated {self._total_timesteps}\n" + \ - f"warmstart timesteps {self._warmstart_timesteps}\n" + \ - f"training replay buffer size {self._replay_buffer_size}\n" + \ - f"training replay buffer vec size {self._replay_buffer_size_vec}\n" + \ - f"training batch size {self._batch_size}\n" + \ - f"validation enabled {self._validate}\n" + \ - f"validation buffer size {self._validation_buffer_size}\n" + \ - f"validation buffer vec size {self._validation_buffer_size_vec}\n" + \ - f"validation collection freq {self._validation_collection_vecfreq}\n" + \ - f"validation update freq {self._validation_db_vecstep_freq}\n" + \ - f"validation batch size {self._validation_batch_size}\n" + \ - f"policy update freq {self._policy_freq}\n" + \ - f"target networks freq {self._trgt_net_freq}\n" + \ - f"episode timeout max steps {self._episode_timeout_ub}\n" + \ - f"episode timeout min steps {self._episode_timeout_lb}\n" + \ - f"task rand. max n steps {self._task_rand_timeout_ub}\n" + \ - f"task rand. 
-            f"task rand. min n steps {self._task_rand_timeout_lb}\n" + \
-            f"number of action reps {self._env_n_action_reps}\n" + \
-            f"total policy updates to be performed: {self._n_policy_updates_to_be_done}\n" + \
-            f"total q fun updates to be performed: {self._n_qf_updates_to_be_done}\n" + \
-            f"total trgt q fun updates to be performed: {self._n_tqf_updates_to_be_done}\n" + \
-            f"experience to policy grad ratio: {self._exp_to_policy_grad_ratio}\n" + \
-            f"experience to q fun grad ratio: {self._exp_to_qf_grad_ratio}\n" + \
-            f"experience to trgt q fun grad ratio: {self._exp_to_qft_grad_ratio}\n" + \
-            f"amount of noisy transitions in replay buffer: {self._noise_buff_freq*100}% \n"
-        db_env_idxs=", ".join(map(str, self._db_env_selector.tolist()))
-        n_db_envs_str=f"db envs {self._num_db_envs}/{self._num_envs} \n"
-        info=info + n_db_envs_str + "Debug env. indexes are [" + db_env_idxs+"]\n"
-        if self._env.demo_env_idxs() is not None:
-            demo_idxs_str=", ".join(map(str, self._env.demo_env_idxs().tolist()))
-            n_demo_envs_str=f"demo envs {self._env.n_demo_envs()}/{self._num_envs} \n"
-            info=info + n_demo_envs_str + "Demo env. indexes are [" + demo_idxs_str+"]\n"
-        if self._expl_env_selector is not None:
-            random_expl_idxs=", ".join(map(str, self._expl_env_selector.tolist()))
-            n_expl_envs_str=f"expl envs {self._n_expl_envs}/{self._num_envs} \n"
-            info=info + n_expl_envs_str + "Random exploration env. indexes are [" + random_expl_idxs+"]\n"
-
-        Journal.log(self.__class__.__name__,
-            "_init_params",
-            info,
-            LogType.INFO,
-            throw_when_excep = True)
-
-        # init counters
-        self._step_counter = 0
-        self._vec_transition_counter = 0
-        self._update_counter = 0
-        self._log_it_counter = 0
-
-    def _init_dbdata(self):
-
-        # initialize some debug data
-        self._collection_dt = torch.full((self._db_data_size, 1),
-            dtype=torch.float32, fill_value=0.0, device="cpu")
-        self._collection_t = -1.0
-
-        self._env_step_fps = torch.full((self._db_data_size, 1),
-            dtype=torch.float32, fill_value=0.0, device="cpu")
-        self._env_step_rt_factor = torch.full((self._db_data_size, 1),
-            dtype=torch.float32, fill_value=0.0, device="cpu")
-
-        self._batch_norm_update_dt = torch.full((self._db_data_size, 1),
-            dtype=torch.float32, fill_value=0.0, device="cpu")
-
-        self._policy_update_t_start = -1.0
-        self._policy_update_t = -1.0
-        self._policy_update_dt = torch.full((self._db_data_size, 1),
-            dtype=torch.float32, fill_value=0.0, device="cpu")
-        self._policy_update_fps = torch.full((self._db_data_size, 1),
-            dtype=torch.float32, fill_value=0.0, device="cpu")
-
-        self._validation_t = -1.0
-        self._validation_dt = torch.full((self._db_data_size, 1),
-            dtype=torch.float32, fill_value=0.0, device="cpu")
-
-        self._n_of_played_episodes = torch.full((self._db_data_size, 1),
-            dtype=torch.int32, fill_value=0, device="cpu")
-        self._n_timesteps_done = torch.full((self._db_data_size, 1),
-            dtype=torch.int32, fill_value=0, device="cpu")
-        self._n_policy_updates = torch.full((self._db_data_size, 1),
-            dtype=torch.int32, fill_value=0, device="cpu")
-        self._n_qfun_updates = torch.full((self._db_data_size, 1),
-            dtype=torch.int32, fill_value=0, device="cpu")
-        self._n_tqfun_updates = torch.full((self._db_data_size, 1),
-            dtype=torch.int32, fill_value=0, device="cpu")
-        self._elapsed_min = torch.full((self._db_data_size, 1),
-            dtype=torch.float32, fill_value=0.0, device="cpu")
-
-        self._ep_tsteps_env_distribution = torch.full((self._db_data_size, self._num_db_envs, 1),
-            dtype=torch.int32, fill_value=-1, device="cpu")
-
-        self._reward_names = self._episodic_reward_metrics.reward_names()
-        self._reward_names_str = "[" + ', '.join(self._reward_names) + "]"
-        self._n_rewards = self._episodic_reward_metrics.n_rewards()
-
-        # db environments
-        self._tot_rew_max = torch.full((self._db_data_size, self._num_db_envs, 1),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        self._tot_rew_avrg = torch.full((self._db_data_size, self._num_db_envs, 1),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        self._tot_rew_min = torch.full((self._db_data_size, self._num_db_envs, 1),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        self._tot_rew_max_over_envs = torch.full((self._db_data_size, 1, 1),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        self._tot_rew_min_over_envs = torch.full((self._db_data_size, 1, 1),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        self._tot_rew_avrg_over_envs = torch.full((self._db_data_size, 1, 1),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        self._tot_rew_std_over_envs = torch.full((self._db_data_size, 1, 1),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-
-        self._sub_rew_max = torch.full((self._db_data_size, self._num_db_envs, self._n_rewards),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        self._sub_rew_avrg = torch.full((self._db_data_size, self._num_db_envs, self._n_rewards),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        self._sub_rew_min = torch.full((self._db_data_size, self._num_db_envs, self._n_rewards),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        self._sub_rew_max_over_envs = torch.full((self._db_data_size, 1, self._n_rewards),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        self._sub_rew_min_over_envs = torch.full((self._db_data_size, 1, self._n_rewards),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        self._sub_rew_avrg_over_envs = torch.full((self._db_data_size, 1, self._n_rewards),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        self._sub_rew_std_over_envs = torch.full((self._db_data_size, 1, self._n_rewards),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-
-        # custom data from env (log data just from db envs for simplicity)
-        self._custom_env_data = {}
-        db_data_names = list(self._env.custom_db_data.keys())
-        for dbdatan in db_data_names: # loop through custom data
-
-            self._custom_env_data[dbdatan] = {}
-
-            max = self._env.custom_db_data[dbdatan].get_max(env_selector=self._db_env_selector).reshape(self._num_db_envs, -1)
-            avrg = self._env.custom_db_data[dbdatan].get_avrg(env_selector=self._db_env_selector).reshape(self._num_db_envs, -1)
-            min = self._env.custom_db_data[dbdatan].get_min(env_selector=self._db_env_selector).reshape(self._num_db_envs, -1)
-            max_over_envs = self._env.custom_db_data[dbdatan].get_max_over_envs(env_selector=self._db_env_selector).reshape(1, -1)
-            min_over_envs = self._env.custom_db_data[dbdatan].get_min_over_envs(env_selector=self._db_env_selector).reshape(1, -1)
-            avrg_over_envs = self._env.custom_db_data[dbdatan].get_avrg_over_envs(env_selector=self._db_env_selector).reshape(1, -1)
-            std_over_envs = self._env.custom_db_data[dbdatan].get_std_over_envs(env_selector=self._db_env_selector).reshape(1, -1)
-
-            self._custom_env_data[dbdatan]["max"] =torch.full((self._db_data_size,
-                max.shape[0],
-                max.shape[1]),
-                dtype=torch.float32, fill_value=torch.nan, device="cpu")
-            self._custom_env_data[dbdatan]["avrg"] =torch.full((self._db_data_size,
-                avrg.shape[0],
-                avrg.shape[1]),
-                dtype=torch.float32, fill_value=torch.nan, device="cpu")
fill_value=torch.nan, device="cpu") - self._custom_env_data[dbdatan]["min"] =torch.full((self._db_data_size, - min.shape[0], - min.shape[1]), - dtype=torch.float32, fill_value=torch.nan, device="cpu") - self._custom_env_data[dbdatan]["max_over_envs"] =torch.full((self._db_data_size, - max_over_envs.shape[0], - max_over_envs.shape[1]), - dtype=torch.float32, fill_value=torch.nan, device="cpu") - self._custom_env_data[dbdatan]["min_over_envs"] =torch.full((self._db_data_size, - min_over_envs.shape[0], - min_over_envs.shape[1]), - dtype=torch.float32, fill_value=torch.nan, device="cpu") - self._custom_env_data[dbdatan]["avrg_over_envs"] =torch.full((self._db_data_size, - avrg_over_envs.shape[0], - avrg_over_envs.shape[1]), - dtype=torch.float32, fill_value=torch.nan, device="cpu") - self._custom_env_data[dbdatan]["std_over_envs"] =torch.full((self._db_data_size, - std_over_envs.shape[0], - std_over_envs.shape[1]), - dtype=torch.float32, fill_value=torch.nan, device="cpu") - - # exploration envs - if self._n_expl_envs > 0: - self._ep_tsteps_expl_env_distribution = torch.full((self._db_data_size, self._n_expl_envs, 1), - dtype=torch.int32, fill_value=-1, device="cpu") - - # also log sub rewards metrics for exploration envs - self._sub_rew_max_expl = torch.full((self._db_data_size, self._n_expl_envs, self._n_rewards), - dtype=torch.float32, fill_value=torch.nan, device="cpu") - self._sub_rew_avrg_expl = torch.full((self._db_data_size, self._n_expl_envs, self._n_rewards), - dtype=torch.float32, fill_value=torch.nan, device="cpu") - self._sub_rew_min_expl = torch.full((self._db_data_size, self._n_expl_envs, self._n_rewards), - dtype=torch.float32, fill_value=torch.nan, device="cpu") - self._sub_rew_max_over_envs_expl = torch.full((self._db_data_size, 1, self._n_rewards), - dtype=torch.float32, fill_value=torch.nan, device="cpu") - self._sub_rew_min_over_envs_expl = torch.full((self._db_data_size, 1, self._n_rewards), - dtype=torch.float32, fill_value=torch.nan, device="cpu") - self._sub_rew_avrg_over_envs_expl = torch.full((self._db_data_size, 1, self._n_rewards), - dtype=torch.float32, fill_value=torch.nan, device="cpu") - self._sub_rew_std_over_envs_expl = torch.full((self._db_data_size, 1, self._n_rewards), - dtype=torch.float32, fill_value=torch.nan, device="cpu") - - # demo environments - self._demo_envs_active = torch.full((self._db_data_size, 1), - dtype=torch.float32, fill_value=torch.nan, device="cpu") - self._demo_perf_metric = torch.full((self._db_data_size, 1), - dtype=torch.float32, fill_value=torch.nan, device="cpu") - if self._env.demo_env_idxs() is not None: - n_demo_envs=self._env.demo_env_idxs().shape[0] - - self._ep_tsteps_demo_env_distribution = torch.full((self._db_data_size, n_demo_envs, 1), - dtype=torch.int32, fill_value=-1, device="cpu") - - # also log sub rewards metrics for exploration envs - self._sub_rew_max_demo = torch.full((self._db_data_size, n_demo_envs, self._n_rewards), - dtype=torch.float32, fill_value=torch.nan, device="cpu") - self._sub_rew_avrg_demo = torch.full((self._db_data_size, n_demo_envs, self._n_rewards), - dtype=torch.float32, fill_value=torch.nan, device="cpu") - self._sub_rew_min_demo = torch.full((self._db_data_size, n_demo_envs, self._n_rewards), - dtype=torch.float32, fill_value=torch.nan, device="cpu") - self._sub_rew_max_over_envs_demo = torch.full((self._db_data_size, 1, self._n_rewards), - dtype=torch.float32, fill_value=torch.nan, device="cpu") - self._sub_rew_min_over_envs_demo = torch.full((self._db_data_size, 1, self._n_rewards), - 
-            self._sub_rew_avrg_over_envs_demo = torch.full((self._db_data_size, 1, self._n_rewards),
-                dtype=torch.float32, fill_value=torch.nan, device="cpu")
-            self._sub_rew_std_over_envs_demo = torch.full((self._db_data_size, 1, self._n_rewards),
-                dtype=torch.float32, fill_value=torch.nan, device="cpu")
-
-        # algorithm-specific db info
-        self._qf1_vals_mean = torch.full((self._db_data_size, 1),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        self._qf2_vals_mean = torch.full((self._db_data_size, 1),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        self._min_qft_vals_mean = torch.full((self._db_data_size, 1),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        self._qf1_vals_std = torch.full((self._db_data_size, 1),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        self._qf2_vals_std = torch.full((self._db_data_size, 1),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        self._min_qft_vals_std = torch.full((self._db_data_size, 1),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        self._qf1_vals_max = torch.full((self._db_data_size, 1),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        self._qf1_vals_min = torch.full((self._db_data_size, 1),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        self._qf2_vals_max = torch.full((self._db_data_size, 1),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        self._qf2_vals_min = torch.full((self._db_data_size, 1),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-
-        self._qf1_loss = torch.full((self._db_data_size, 1),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        self._qf2_loss = torch.full((self._db_data_size, 1),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        self._actor_loss= torch.full((self._db_data_size, 1),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        self._alpha_loss = torch.full((self._db_data_size, 1),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        self._alpha_loss_disc = torch.full((self._db_data_size, 1),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        self._alpha_loss_cont = torch.full((self._db_data_size, 1),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        if self._validate: # add db data for validation losses
-            self._overfit_index = torch.full((self._db_data_size, 1),
-                dtype=torch.float32, fill_value=torch.nan, device="cpu")
-            self._qf1_loss_validation = torch.full((self._db_data_size, 1),
-                dtype=torch.float32, fill_value=torch.nan, device="cpu")
-            self._qf2_loss_validation = torch.full((self._db_data_size, 1),
-                dtype=torch.float32, fill_value=torch.nan, device="cpu")
-            self._actor_loss_validation= torch.full((self._db_data_size, 1),
-                dtype=torch.float32, fill_value=torch.nan, device="cpu")
-            self._alpha_loss_validation = torch.full((self._db_data_size, 1),
-                dtype=torch.float32, fill_value=torch.nan, device="cpu")
-            self._alpha_loss_disc_validation = torch.full((self._db_data_size, 1),
-                dtype=torch.float32, fill_value=torch.nan, device="cpu")
-            self._alpha_loss_cont_validation = torch.full((self._db_data_size, 1),
-                dtype=torch.float32, fill_value=torch.nan, device="cpu")
-
-        self._alphas = torch.full((self._db_data_size, 1),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        self._alphas_disc = torch.full((self._db_data_size, 1),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        self._alphas_cont = torch.full((self._db_data_size, 1),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        self._policy_entropy_mean=torch.full((self._db_data_size, 1),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        self._policy_entropy_std=torch.full((self._db_data_size, 1),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        self._policy_entropy_max=torch.full((self._db_data_size, 1),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        self._policy_entropy_min=torch.full((self._db_data_size, 1),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        self._policy_entropy_disc_mean=torch.full((self._db_data_size, 1),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        self._policy_entropy_disc_std=torch.full((self._db_data_size, 1),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        self._policy_entropy_disc_max=torch.full((self._db_data_size, 1),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        self._policy_entropy_disc_min=torch.full((self._db_data_size, 1),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        self._policy_entropy_cont_mean=torch.full((self._db_data_size, 1),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        self._policy_entropy_cont_std=torch.full((self._db_data_size, 1),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        self._policy_entropy_cont_max=torch.full((self._db_data_size, 1),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-        self._policy_entropy_cont_min=torch.full((self._db_data_size, 1),
-            dtype=torch.float32, fill_value=torch.nan, device="cpu")
-
-        self._running_mean_obs=None
-        self._running_std_obs=None
-        if self._agent.obs_running_norm is not None and not self._eval:
-            # some db data for the agent
-            self._running_mean_obs = torch.full((self._db_data_size, self._env.obs_dim()),
-                dtype=torch.float32, fill_value=0.0, device="cpu")
-            self._running_std_obs = torch.full((self._db_data_size, self._env.obs_dim()),
-                dtype=torch.float32, fill_value=0.0, device="cpu")
-
-        # RND
-        self._rnd_loss=None
-        if self._use_rnd:
-            self._rnd_loss = torch.full((self._db_data_size, 1),
-                dtype=torch.float32, fill_value=torch.nan, device="cpu")
-
-            self._expl_bonus_raw_avrg = torch.full((self._db_data_size, 1),
-                dtype=torch.float32, fill_value=torch.nan, device="cpu")
-            self._expl_bonus_raw_std = torch.full((self._db_data_size, 1),
-                dtype=torch.float32, fill_value=torch.nan, device="cpu")
-            # self._expl_bonus_raw_min = torch.full((self._db_data_size, 1),
-            #     dtype=torch.float32, fill_value=torch.nan, device="cpu")
-            # self._expl_bonus_raw_max = torch.full((self._db_data_size, 1),
-            #     dtype=torch.float32, fill_value=torch.nan, device="cpu")
-
-            self._expl_bonus_proc_avrg = torch.full((self._db_data_size, 1),
-                dtype=torch.float32, fill_value=torch.nan, device="cpu")
-            self._expl_bonus_proc_std = torch.full((self._db_data_size, 1),
-                dtype=torch.float32, fill_value=torch.nan, device="cpu")
-            # self._expl_bonus_proc_min = torch.full((self._db_data_size, 1),
-            #     dtype=torch.float32, fill_value=torch.nan, device="cpu")
-            # self._expl_bonus_proc_max = torch.full((self._db_data_size, 1),
-            #     dtype=torch.float32, fill_value=torch.nan, device="cpu")
-
-            self._n_rnd_updates = torch.full((self._db_data_size, 1),
-                dtype=torch.int32, fill_value=0, device="cpu")
-            self._running_mean_rnd_input = None
-            self._running_std_rnd_input = None
-            if self._rnd_net.obs_running_norm is not None:
-                self._running_mean_rnd_input = torch.full((self._db_data_size, self._rnd_net.input_dim()),
-                    dtype=torch.float32, fill_value=0.0, device="cpu")
-                self._running_std_rnd_input = torch.full((self._db_data_size, self._rnd_net.input_dim()),
-                    dtype=torch.float32, fill_value=0.0, device="cpu")
-
-    def _init_agent_optimizers(self):
-        self._qf_optimizer = optim.Adam(list(self._agent.qf1.parameters()) + list(self._agent.qf2.parameters()),
-            lr=self._lr_q)
-        self._actor_optimizer = optim.Adam(list(self._agent.actor.parameters()),
-            lr=self._lr_policy)
-
-    def _init_alpha_autotuning(self):
-        self._log_alpha_disc = torch.full((1,), fill_value=math.log(self._alpha_disc), requires_grad=True, device=self._torch_device)
-        self._log_alpha_cont = torch.full((1,), fill_value=math.log(self._alpha_cont), requires_grad=True, device=self._torch_device)
-        self._alpha_disc = self._log_alpha_disc.exp().item()
-        self._alpha_cont = self._log_alpha_cont.exp().item()
-        self._alpha = 0.5*(self._alpha_disc + self._alpha_cont)
-        self._a_optimizer_disc = optim.Adam([self._log_alpha_disc], lr=self._lr_q)
-        self._a_optimizer_cont = optim.Adam([self._log_alpha_cont], lr=self._lr_q)
-
-    def _init_replay_buffers(self):
-
-        self._bpos = 0
-
-        self._obs = torch.full(size=(self._replay_buffer_size_vec, self._num_envs, self._obs_dim),
-            fill_value=torch.nan,
-            dtype=self._dtype,
-            device=self._torch_device,
-            requires_grad=False)
-        self._actions = torch.full(size=(self._replay_buffer_size_vec, self._num_envs, self._actions_dim),
-            fill_value=torch.nan,
-            dtype=self._dtype,
-            device=self._torch_device,
-            requires_grad=False)
-        self._rewards = torch.full(size=(self._replay_buffer_size_vec, self._num_envs, 1),
-            fill_value=torch.nan,
-            dtype=self._dtype,
-            device=self._torch_device,
-            requires_grad=False)
-        self._next_obs = torch.full(size=(self._replay_buffer_size_vec, self._num_envs, self._obs_dim),
-            fill_value=torch.nan,
-            dtype=self._dtype,
-            device=self._torch_device,
-            requires_grad=False)
-        self._next_terminal = torch.full(size=(self._replay_buffer_size_vec, self._num_envs, 1),
-            fill_value=False,
-            dtype=self._dtype,
-            device=self._torch_device,
-            requires_grad=False)
-
-    def _init_validation_buffers(self):
-
-        self._bpos_val = 0
-
-        self._obs_val = torch.full(size=(self._validation_buffer_size_vec, self._num_envs, self._obs_dim),
-            fill_value=torch.nan,
-            dtype=self._dtype,
-            device=self._torch_device,
-            requires_grad=False)
-        self._actions_val = torch.full(size=(self._validation_buffer_size_vec, self._num_envs, self._actions_dim),
-            fill_value=torch.nan,
-            dtype=self._dtype,
-            device=self._torch_device,
-            requires_grad=False)
-        self._rewards_val = torch.full(size=(self._validation_buffer_size_vec, self._num_envs, 1),
-            fill_value=torch.nan,
-            dtype=self._dtype,
-            device=self._torch_device,
-            requires_grad=False)
-        self._next_obs_val = torch.full(size=(self._validation_buffer_size_vec, self._num_envs, self._obs_dim),
-            fill_value=torch.nan,
-            dtype=self._dtype,
-            device=self._torch_device,
-            requires_grad=False)
-        self._next_terminal_val = torch.full(size=(self._validation_buffer_size_vec, self._num_envs, 1),
-            fill_value=False,
-            dtype=self._dtype,
-            device=self._torch_device,
-            requires_grad=False)
-
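The buffers above are preallocated as `(buffer_size_vec, n_envs, dim)` tensors and written circularly through `_bpos`; sampling then flattens the (time, env) axes. A minimal sketch of that access pattern under these assumptions (the actual write/sample code lives in the subclass and is not shown here):

```python
import torch

size_vec, n_envs, obs_dim = 128, 4, 8
obs_buf = torch.full((size_vec, n_envs, obs_dim), float("nan"))
bpos, full = 0, False

def store(obs_batch: torch.Tensor) -> None:
    # write one vectorized transition, wrapping around when the buffer is full
    global bpos, full
    obs_buf[bpos] = obs_batch
    bpos = (bpos + 1) % size_vec
    full = full or (bpos == 0)

def sample(batch_size: int) -> torch.Tensor:
    # uniform sampling over the flattened (time, env) axes of the valid region
    high = size_vec if full else bpos
    flat = obs_buf[:high].reshape(high * n_envs, obs_dim)
    idxs = torch.randint(0, flat.shape[0], (batch_size,))
    return flat[idxs]

store(torch.randn(n_envs, obs_dim))
batch = sample(2)
```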
-    def _save_model(self,
-            is_checkpoint: bool = False):
-
-        path = self._model_path
-        if is_checkpoint: # use iteration as id
-            path = path + "_checkpoint" + str(self._log_it_counter)
-        info = f"Saving model to {path}"
-        Journal.log(self.__class__.__name__,
-            "_save_model",
-            info,
-            LogType.INFO,
-            throw_when_excep = True)
-        agent_state_dict=self._agent.state_dict()
-        if not self._eval: # training
-            # we log the joints which were observed during training
-            observed_joints=self._env.get_observed_joints()
-            if observed_joints is not None:
-                agent_state_dict["observed_jnts"]=observed_joints
-
-        torch.save(agent_state_dict, path) # saves whole agent state
-        # torch.save(self._agent.parameters(), path) # only save agent parameters
-        info = "Done."
-        Journal.log(self.__class__.__name__,
-            "_save_model",
-            info,
-            LogType.INFO,
-            throw_when_excep = True)
-
-    def _dump_env_checkpoints(self):
-
-        if self._env_db_checkpoints_fname is not None:
-            path = self._env_db_checkpoints_fname+str(self._log_it_counter)
-
-            info = f"Saving env db checkpoint data to {path}"
-            Journal.log(self.__class__.__name__,
-                "_dump_env_checkpoints",
-                info,
-                LogType.INFO,
-                throw_when_excep = True)
-
-            with h5py.File(path+".hdf5", 'w') as hf:
-
-                for key, value in self._hyperparameters.items():
-                    if value is None:
-                        value = "None"
-                    hf.attrs[key] = value
-
-                # full training envs
-                sub_rew_full=self._episodic_reward_metrics.get_full_episodic_subrew(env_selector=self._db_env_selector)
-                tot_rew_full=self._episodic_reward_metrics.get_full_episodic_totrew(env_selector=self._db_env_selector)
-
-                if self._n_expl_envs > 0:
-                    sub_rew_full_expl=self._episodic_reward_metrics.get_full_episodic_subrew(env_selector=self._expl_env_selector)
-                    tot_rew_full_expl=self._episodic_reward_metrics.get_full_episodic_totrew(env_selector=self._expl_env_selector)
-                if self._env.n_demo_envs() > 0:
-                    sub_rew_full_demo=self._episodic_reward_metrics.get_full_episodic_subrew(env_selector=self._demo_env_selector)
-                    tot_rew_full_demo=self._episodic_reward_metrics.get_full_episodic_totrew(env_selector=self._demo_env_selector)
-
-                ep_vec_freq=self._episodic_reward_metrics.ep_vec_freq() # assuming all db data was collected with the same ep_vec_freq
-
-                hf.attrs['sub_reward_names'] = self._reward_names
-                hf.attrs['log_iteration'] = self._log_it_counter
-                hf.attrs['n_timesteps_done'] = self._n_timesteps_done[self._log_it_counter]
-                hf.attrs['n_policy_updates'] = self._n_policy_updates[self._log_it_counter]
-                hf.attrs['elapsed_min'] = self._elapsed_min[self._log_it_counter]
-                hf.attrs['ep_vec_freq'] = ep_vec_freq
-                hf.attrs['n_expl_envs'] = self._n_expl_envs
-                hf.attrs['n_demo_envs'] = self._env.n_demo_envs()
-
-                # first dump custom db data names
-                db_data_names = list(self._env.custom_db_data.keys())
-                for db_dname in db_data_names:
-                    episodic_data_names = self._env.custom_db_data[db_dname].data_names()
-                    var_name = db_dname
-                    hf.attrs[var_name+"_data_names"] = episodic_data_names
-
-                # env selectors are episode-independent -> dump them once, outside the loop
-                if self._n_expl_envs > 0:
-                    hf.create_dataset('expl_env_selector', data=self._expl_env_selector.cpu().numpy())
-                if self._env.n_demo_envs() > 0:
-                    hf.create_dataset('demo_env_idxs', data=self._env.demo_env_idxs().cpu().numpy())
-
-                for ep_idx in range(ep_vec_freq): # create separate datasets for each episode
-                    ep_prefix=f'ep_{ep_idx}_'
-
-                    # rewards
-                    hf.create_dataset(ep_prefix+'sub_rew',
-                        data=sub_rew_full[ep_idx, :, :, :])
-                    hf.create_dataset(ep_prefix+'tot_rew',
-                        data=tot_rew_full[ep_idx, :, :, :])
-                    if self._n_expl_envs > 0:
-                        hf.create_dataset(ep_prefix+'sub_rew_expl',
-                            data=sub_rew_full_expl[ep_idx, :, :, :])
-                        hf.create_dataset(ep_prefix+'tot_rew_expl',
-                            data=tot_rew_full_expl[ep_idx, :, :, :])
-                    if self._env.n_demo_envs() > 0:
-                        hf.create_dataset(ep_prefix+'sub_rew_demo',
-                            data=sub_rew_full_demo[ep_idx, :, :, :])
-                        hf.create_dataset(ep_prefix+'tot_rew_demo',
-                            data=tot_rew_full_demo[ep_idx, :, :, :])
-
-                    # dump all custom env data
-                    db_data_names = list(self._env.custom_db_data.keys())
-                    for db_dname in db_data_names:
-                        episodic_data=self._env.custom_db_data[db_dname]
-                        var_name = db_dname
-                        hf.create_dataset(ep_prefix+var_name,
-                            data=episodic_data.get_full_episodic_data(env_selector=self._db_env_selector)[ep_idx, :, :, :])
-                        if self._n_expl_envs > 0:
-                            hf.create_dataset(ep_prefix+var_name+"_expl",
-                                data=episodic_data.get_full_episodic_data(env_selector=self._expl_env_selector)[ep_idx, :, :, :])
-                        if self._env.n_demo_envs() > 0:
-                            hf.create_dataset(ep_prefix+var_name+"_demo",
-                                data=episodic_data.get_full_episodic_data(env_selector=self._demo_env_selector)[ep_idx, :, :, :])
-
-            Journal.log(self.__class__.__name__,
-                "_dump_env_checkpoints",
-                "done.",
-                LogType.INFO,
-                throw_when_excep = True)
-
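The checkpoint files written above can be inspected offline with plain h5py; a short sketch of reading one back, assuming the layout produced by `_dump_env_checkpoints` (per-episode datasets prefixed with `ep_<i>_`; the filename is illustrative):

```python
import h5py

with h5py.File("env_db_checkpoint0.hdf5", "r") as hf:
    reward_names = list(hf.attrs["sub_reward_names"])
    ep_vec_freq = int(hf.attrs["ep_vec_freq"])
    for ep_idx in range(ep_vec_freq):
        # load the full episodic sub-rewards for this episode as a numpy array
        sub_rew = hf[f"ep_{ep_idx}_sub_rew"][...]
        print(ep_idx, reward_names, sub_rew.shape)
```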
hf.create_dataset(ep_prefix+var_name, - data=episodic_data.get_full_episodic_data(env_selector=self._db_env_selector)[ep_idx, :, :, :]) - if self._n_expl_envs > 0: - hf.create_dataset(ep_prefix+var_name+"_expl", - data=episodic_data.get_full_episodic_data(env_selector=self._expl_env_selector)[ep_idx, :, :, :]) - if self._env.n_demo_envs() > 0: - hf.create_dataset(ep_prefix+var_name+"_demo", - data=episodic_data.get_full_episodic_data(env_selector=self._demo_env_selector)[ep_idx, :, :, :]) - - Journal.log(self.__class__.__name__, - "_dump_env_checkpoints", - "done.", - LogType.INFO, - throw_when_excep = True) - - def done(self): - - if not self._is_done: - - if not self._eval: - self._save_model() - - self._dump_dbinfo_to_file() - - if self._full_env_db: - self._dump_env_checkpoints() - - if self._shared_algo_data is not None: - self._shared_algo_data.write(dyn_info_name=["is_done"], - val=[1.0]) - self._shared_algo_data.close() # close shared memory - - self._env.close() - - self._is_done = True - - def _dump_dbinfo_to_file(self): - - info = f"Dumping debug info at {self._dbinfo_drop_fname}" - Journal.log(self.__class__.__name__, - "_dump_dbinfo_to_file", - info, - LogType.INFO, - throw_when_excep = True) - - with h5py.File(self._dbinfo_drop_fname+".hdf5", 'w') as hf: - n_valid = int(max(0, min(self._log_it_counter, self._db_data_size))) - - def _slice_valid(arr): - if isinstance(arr, torch.Tensor): - return arr[:n_valid] - if isinstance(arr, np.ndarray): - return arr[:n_valid] - return arr - - def _ds(name, arr): - data = _slice_valid(arr) - hf.create_dataset(name, data=data.numpy() if isinstance(data, torch.Tensor) else data) - - # hf.create_dataset('numpy_data', data=numpy_data) - # Write dictionaries to HDF5 as attributes - for key, value in self._hyperparameters.items(): - if value is None: - value = "None" - hf.attrs[key] = value - - # rewards - hf.create_dataset('sub_reward_names', data=self._reward_names, - dtype='S40') - _ds('sub_rew_max', self._sub_rew_max) - _ds('sub_rew_avrg', self._sub_rew_avrg) - _ds('sub_rew_min', self._sub_rew_min) - _ds('sub_rew_max_over_envs', self._sub_rew_max_over_envs) - _ds('sub_rew_min_over_envs', self._sub_rew_min_over_envs) - _ds('sub_rew_avrg_over_envs', self._sub_rew_avrg_over_envs) - _ds('sub_rew_std_over_envs', self._sub_rew_std_over_envs) - - _ds('tot_rew_max', self._tot_rew_max) - _ds('tot_rew_avrg', self._tot_rew_avrg) - _ds('tot_rew_min', self._tot_rew_min) - _ds('tot_rew_max_over_envs', self._tot_rew_max_over_envs) - _ds('tot_rew_min_over_envs', self._tot_rew_min_over_envs) - _ds('tot_rew_avrg_over_envs', self._tot_rew_avrg_over_envs) - _ds('tot_rew_std_over_envs', self._tot_rew_std_over_envs) - - _ds('ep_tsteps_env_distr', self._ep_tsteps_env_distribution) - - if self._n_expl_envs > 0: - # expl envs - _ds('sub_rew_max_expl', self._sub_rew_max_expl) - _ds('sub_rew_avrg_expl', self._sub_rew_avrg_expl) - _ds('sub_rew_min_expl', self._sub_rew_min_expl) - _ds('sub_rew_max_over_envs_expl', self._sub_rew_max_over_envs_expl) - _ds('sub_rew_min_over_envs_expl', self._sub_rew_min_over_envs_expl) - _ds('sub_rew_avrg_over_envs_expl', self._sub_rew_avrg_over_envs_expl) - _ds('sub_rew_std_over_envs_expl', self._sub_rew_std_over_envs_expl) - - _ds('ep_timesteps_expl_env_distr', self._ep_tsteps_expl_env_distribution) - - hf.create_dataset('expl_env_selector', data=self._expl_env_selector.numpy()) - - _ds('demo_envs_active', self._demo_envs_active) - _ds('demo_perf_metric', self._demo_perf_metric) - - if self._env.n_demo_envs() > 0: - # demo envs - 
_ds('sub_rew_max_demo', self._sub_rew_max_demo) - _ds('sub_rew_avrg_demo', self._sub_rew_avrg_demo) - _ds('sub_rew_min_demo', self._sub_rew_min_demo) - _ds('sub_rew_max_over_envs_demo', self._sub_rew_max_over_envs_demo) - _ds('sub_rew_min_over_envs_demo', self._sub_rew_min_over_envs_demo) - _ds('sub_rew_avrg_over_envs_demo', self._sub_rew_avrg_over_envs_demo) - _ds('sub_rew_std_over_envs_demo', self._sub_rew_std_over_envs_demo) - - _ds('ep_timesteps_demo_env_distr', self._ep_tsteps_demo_env_distribution) - - hf.create_dataset('demo_env_idxs', data=self._env.demo_env_idxs().numpy()) - - # profiling data - _ds('env_step_fps', self._env_step_fps) - _ds('env_step_rt_factor', self._env_step_rt_factor) - _ds('collection_dt', self._collection_dt) - _ds('batch_norm_update_dt', self._batch_norm_update_dt) - _ds('policy_update_dt', self._policy_update_dt) - _ds('policy_update_fps', self._policy_update_fps) - _ds('validation_dt', self._validation_dt) - - _ds('n_of_played_episodes', self._n_of_played_episodes) - _ds('n_timesteps_done', self._n_timesteps_done) - _ds('n_policy_updates', self._n_policy_updates) - _ds('n_qfun_updates', self._n_qfun_updates) - _ds('n_tqfun_updates', self._n_tqfun_updates) - - _ds('elapsed_min', self._elapsed_min) - - # algo data - _ds('qf1_vals_mean', self._qf1_vals_mean) - _ds('qf2_vals_mean', self._qf2_vals_mean) - _ds('qf1_vals_std', self._qf1_vals_std) - _ds('qf2_vals_std', self._qf2_vals_std) - _ds('qf1_vals_max', self._qf1_vals_max) - _ds('qf1_vals_min', self._qf1_vals_min) - _ds('qf2_vals_max', self._qf2_vals_max) - _ds('qf2_vals_min', self._qf2_vals_min) - - _ds('min_qft_vals_mean', self._min_qft_vals_mean) - _ds('min_qft_vals_std', self._min_qft_vals_std) - - _ds('qf1_loss', self._qf1_loss) - _ds('qf2_loss', self._qf2_loss) - _ds('actor_loss', self._actor_loss) - _ds('alpha_loss', self._alpha_loss) - _ds('alpha_loss_disc', self._alpha_loss_disc) - _ds('alpha_loss_cont', self._alpha_loss_cont) - if self._validate: - _ds('qf1_loss_validation', self._qf1_loss_validation) - _ds('qf2_loss_validation', self._qf2_loss_validation) - _ds('actor_loss_validation', self._actor_loss_validation) - _ds('alpha_loss_validation', self._alpha_loss_validation) - _ds('alpha_loss_disc_validation', self._alpha_loss_disc_validation) - _ds('alpha_loss_cont_validation', self._alpha_loss_cont_validation) - _ds('overfit_index', self._overfit_index) - - _ds('alphas', self._alphas) - _ds('alphas_disc', self._alphas_disc) - _ds('alphas_cont', self._alphas_cont) - - _ds('policy_entropy_mean', self._policy_entropy_mean) - _ds('policy_entropy_std', self._policy_entropy_std) - _ds('policy_entropy_max', self._policy_entropy_max) - _ds('policy_entropy_min', self._policy_entropy_min) - _ds('policy_entropy_disc_mean', self._policy_entropy_disc_mean) - _ds('policy_entropy_disc_std', self._policy_entropy_disc_std) - _ds('policy_entropy_disc_max', self._policy_entropy_disc_max) - _ds('policy_entropy_disc_min', self._policy_entropy_disc_min) - _ds('policy_entropy_cont_mean', self._policy_entropy_cont_mean) - _ds('policy_entropy_cont_std', self._policy_entropy_cont_std) - _ds('policy_entropy_cont_max', self._policy_entropy_cont_max) - _ds('policy_entropy_cont_min', self._policy_entropy_cont_min) - hf.create_dataset('target_entropy', data=self._target_entropy) - hf.create_dataset('target_entropy_disc', data=self._target_entropy_disc) - hf.create_dataset('target_entropy_cont', data=self._target_entropy_cont) - - if self._use_rnd: - _ds('n_rnd_updates', self._n_rnd_updates) - _ds('expl_bonus_raw_avrg', 
self._expl_bonus_raw_avrg) - _ds('expl_bonus_raw_std', self._expl_bonus_raw_std) - _ds('expl_bonus_proc_avrg', self._expl_bonus_proc_avrg) - _ds('expl_bonus_proc_std', self._expl_bonus_proc_std) - - if self._rnd_net.obs_running_norm is not None: - if self._running_mean_rnd_input is not None: - _ds('running_mean_rnd_input', self._running_mean_rnd_input) - if self._running_std_rnd_input is not None: - _ds('running_std_rnd_input', self._running_std_rnd_input) - - # dump all custom env data - db_data_names = list(self._env.custom_db_data.keys()) - for db_dname in db_data_names: - data=self._custom_env_data[db_dname] - subnames = list(data.keys()) - for subname in subnames: - var_name = db_dname + "_" + subname - _ds(var_name, data[subname]) - - # other data - if self._agent.obs_running_norm is not None: - if self._running_mean_obs is not None: - _ds('running_mean_obs', self._running_mean_obs) - if self._running_std_obs is not None: - _ds('running_std_obs', self._running_std_obs) - - info = "done." - Journal.log(self.__class__.__name__, - "_dump_dbinfo_to_file", - info, - LogType.INFO, - throw_when_excep = True) - - def _load_model(self, - model_path: str): - - info = f"Loading model at {model_path}" - - Journal.log(self.__class__.__name__, - "_load_model", - info, - LogType.INFO, - throw_when_excep = True) - model_dict=torch.load(model_path, - map_location=self._torch_device, - weights_only=False) - - observed_joints=self._env.get_observed_joints() - if "observed_jnts" not in model_dict: - Journal.log(self.__class__.__name__, - "_load_model", - "No observed joints key found in loaded model dictionary! Let's hope the joints are ordered in the same way.", - LogType.WARN) - else: - # pop the aux key so that load_state_dict below does not see it as an unexpected key - required_joints=model_dict.pop("observed_jnts") - self._check_observed_joints(observed_joints,required_joints) - - self._agent.load_state_dict(model_dict) - - if self._eval: - self._switch_training_mode(False) - - def _check_observed_joints(self, - observed_joints, - required_joints): - - observed=set(observed_joints) - required=set(required_joints) - - all_required_joints_avail = required.issubset(observed) - if not all_required_joints_avail: - missing=[item for item in required if item not in observed] - missing_str=', '.join(missing) - Journal.log(self.__class__.__name__, - "_check_observed_joints", - f"not all required joints are available. Missing: {missing_str}", - LogType.EXCEP, - throw_when_excep = True) - exceeding=observed-required - if len(exceeding) != 0: - # having more joints than required is not supported - exc_jnts=" ".join(list(exceeding)) - Journal.log(self.__class__.__name__, - "_check_observed_joints", - f"found more observed joints than required: {exc_jnts}", - LogType.EXCEP, - throw_when_excep = True) - - # here we are sure that the required and observed sets match - self._to_agent_jnt_remap=None - if required_joints != observed_joints: - Journal.log(self.__class__.__name__, - "_check_observed_joints", - f"required joint observations from the agent are ordered differently from the observed ones. 
Will compute a remapping.", - LogType.WARN, - throw_when_excep = True) - self._to_agent_jnt_remap = [observed_joints.index(element) for element in required_joints] - - self._env.set_jnts_remapping(remapping= self._to_agent_jnt_remap) - - def drop_dir(self): - return self._drop_dir - - def _init_drop_dir(self, - drop_dir_name: str = None): - - # main drop directory - if drop_dir_name is None: - # drop to current directory - self._drop_dir = "./" + f"{self.__class__.__name__}/" + self._run_name + "/" + self._unique_id - else: - self._drop_dir = drop_dir_name + "/" + f"{self.__class__.__name__}/" + self._run_name + "/" + self._unique_id - os.makedirs(self._drop_dir) - - self._env_db_checkpoints_dropdir=None - self._env_db_checkpoints_fname=None - if self._full_env_db>0: - self._env_db_checkpoints_dropdir=self._drop_dir+"/env_db_checkpoints" - self._env_db_checkpoints_fname = self._env_db_checkpoints_dropdir + \ - "/" + self._unique_id + "_env_db_checkpoint" - os.makedirs(self._env_db_checkpoints_dropdir) - # model - if not self._eval or (self._model_path is None): - self._model_path = self._drop_dir + "/" + self._unique_id + "_model" - else: # we copy the model under evaluation to the drop dir - shutil.copy(self._model_path, self._drop_dir) - - # debug info - self._dbinfo_drop_fname = self._drop_dir + "/" + self._unique_id + "db_info" # extension added later - - # other auxiliary db files - aux_drop_dir = self._drop_dir + "/other" - os.makedirs(aux_drop_dir) - filepaths = self._env.get_file_paths() # envs implementation - filepaths.append(self._this_basepath) # algorithm implementation - filepaths.append(self._this_child_path) - filepaths.append(self._agent.get_impl_path()) # agent implementation - for file in filepaths: - shutil.copy(file, self._drop_dir) - aux_dirs = self._env.get_aux_dir() - for aux_dir in aux_dirs: - shutil.copytree(aux_dir, aux_drop_dir, dirs_exist_ok=True) - - def _get_performance_metric(self): - # to be overridden - return 0.0 - - def _post_step(self): - - self._collection_dt[self._log_it_counter] += \ - (self._collection_t-self._start_time) - self._batch_norm_update_dt[self._log_it_counter] += \ - (self._policy_update_t_start-self._collection_t) - self._policy_update_dt[self._log_it_counter] += \ - (self._policy_update_t - self._policy_update_t_start) - if self._validate: - self._validation_dt[self._log_it_counter] += \ - (self._validation_t - self._policy_update_t) - - self._step_counter+=1 # counts algo steps - - self._demo_envs_active[self._log_it_counter]=self._env.demo_active() - self._demo_perf_metric[self._log_it_counter]=self._get_performance_metric() - if self._env.n_demo_envs() > 0 and (self._demo_stop_thresh is not None): - # check if deactivation condition applies - self._env.switch_demo(active=self._demo_perf_metric[self._log_it_counter] 0: - self._ep_tsteps_expl_env_distribution[self._log_it_counter, :]=\ - self._episodic_reward_metrics.step_counters(env_selector=self._expl_env_selector)*self._env_n_action_reps - - self._sub_rew_max_expl[self._log_it_counter, :, :] = self._episodic_reward_metrics.get_sub_rew_max(env_selector=self._expl_env_selector) - self._sub_rew_avrg_expl[self._log_it_counter, :, :] = self._episodic_reward_metrics.get_sub_rew_avrg(env_selector=self._expl_env_selector) - self._sub_rew_min_expl[self._log_it_counter, :, :] = self._episodic_reward_metrics.get_sub_rew_min(env_selector=self._expl_env_selector) - self._sub_rew_max_over_envs_expl[self._log_it_counter, :, :] = 
self._episodic_reward_metrics.get_sub_rew_max_over_envs(env_selector=self._expl_env_selector) - self._sub_rew_min_over_envs_expl[self._log_it_counter, :, :] = self._episodic_reward_metrics.get_sub_rew_min_over_envs(env_selector=self._expl_env_selector) - self._sub_rew_avrg_over_envs_expl[self._log_it_counter, :, :] = self._episodic_reward_metrics.get_sub_rew_avrg_over_envs(env_selector=self._expl_env_selector) - self._sub_rew_std_over_envs_expl[self._log_it_counter, :, :] = self._episodic_reward_metrics.get_sub_rew_std_over_envs(env_selector=self._expl_env_selector) - - # demo envs - if self._env.n_demo_envs() > 0 and self._env.demo_active(): - # only log if demo envs are active (db data will remain nan in that case) - self._ep_tsteps_demo_env_distribution[self._log_it_counter, :]=\ - self._episodic_reward_metrics.step_counters(env_selector=self._demo_env_selector)*self._env_n_action_reps - - self._sub_rew_max_demo[self._log_it_counter, :, :] = self._episodic_reward_metrics.get_sub_rew_max(env_selector=self._demo_env_selector) - self._sub_rew_avrg_demo[self._log_it_counter, :, :] = self._episodic_reward_metrics.get_sub_rew_avrg(env_selector=self._demo_env_selector) - self._sub_rew_min_demo[self._log_it_counter, :, :] = self._episodic_reward_metrics.get_sub_rew_min(env_selector=self._demo_env_selector) - self._sub_rew_max_over_envs_demo[self._log_it_counter, :, :] = self._episodic_reward_metrics.get_sub_rew_max_over_envs(env_selector=self._demo_env_selector) - self._sub_rew_min_over_envs_demo[self._log_it_counter, :, :] = self._episodic_reward_metrics.get_sub_rew_min_over_envs(env_selector=self._demo_env_selector) - self._sub_rew_avrg_over_envs_demo[self._log_it_counter, :, :] = self._episodic_reward_metrics.get_sub_rew_avrg_over_envs(env_selector=self._demo_env_selector) - self._sub_rew_std_over_envs_demo[self._log_it_counter, :, :] = self._episodic_reward_metrics.get_sub_rew_std_over_envs(env_selector=self._demo_env_selector) - - # other data - if self._agent.obs_running_norm is not None: - if self._running_mean_obs is not None: - self._running_mean_obs[self._log_it_counter, :] = self._agent.obs_running_norm.get_current_mean() - if self._running_std_obs is not None: - self._running_std_obs[self._log_it_counter, :] = self._agent.obs_running_norm.get_current_std() - - if self._use_rnd: - if self._running_mean_rnd_input is not None: - self._running_mean_rnd_input[self._log_it_counter, :] = self._rnd_net.obs_running_norm.get_current_mean() - if self._running_std_rnd_input is not None: - self._running_std_rnd_input[self._log_it_counter, :] = self._rnd_net.obs_running_norm.get_current_std() - - # write some episodic db info on shared mem - sub_returns=self._sub_returns.get_torch_mirror(gpu=False) - sub_returns[:, :]=self._episodic_reward_metrics.get_sub_rew_avrg() - tot_returns=self._tot_returns.get_torch_mirror(gpu=False) - tot_returns[:, :]=self._episodic_reward_metrics.get_tot_rew_avrg() - self._sub_returns.synch_all(read=False) - self._tot_returns.synch_all(read=False) - - self._log_info() - - self._log_it_counter+=1 - - if self._dump_checkpoints and \ - (self._vec_transition_counter % self._m_checkpoint_freq == 0): - self._save_model(is_checkpoint=True) - - if self._full_env_db and \ - (self._vec_transition_counter % self._env_db_checkpoints_vecfreq == 0): - self._dump_env_checkpoints() - - if self._vec_transition_counter == self._total_timesteps_vec: - self.done() - - def _should_have_called_setup(self): - - exception = "setup() was not called!" 
- - Journal.log(self.__class__.__name__, - "_should_have_called_setup", - exception, - LogType.EXCEP, - throw_when_excep = True) - - def _log_info(self): - - if self._debug or self._verbose: - elapsed_h = self._elapsed_min[self._log_it_counter].item()/60.0 - est_remaining_time_h = elapsed_h * 1/(self._vec_transition_counter) * (self._total_timesteps_vec-self._vec_transition_counter) - is_done=self._vec_transition_counter==self._total_timesteps_vec - - actual_tsteps_with_updates=-1 - experience_to_policy_grad_ratio=-1 - experience_to_qfun_grad_ratio=-1 - experience_to_tqfun_grad_ratio=-1 - if not self._eval: - actual_tsteps_with_updates=(self._n_timesteps_done[self._log_it_counter].item()-self._warmstart_timesteps) - epsi=1e-6 # to avoid div by 0 - experience_to_policy_grad_ratio=actual_tsteps_with_updates/(self._n_policy_updates[self._log_it_counter].item()-epsi) - experience_to_qfun_grad_ratio=actual_tsteps_with_updates/(self._n_qfun_updates[self._log_it_counter].item()-epsi) - experience_to_tqfun_grad_ratio=actual_tsteps_with_updates/(self._n_tqfun_updates[self._log_it_counter].item()-epsi) - - if self._debug: - - if self._remote_db: - # write general algo debug info to shared memory - info_names=self._shared_algo_data.dynamic_info.get() - info_data = [ - self._n_timesteps_done[self._log_it_counter].item(), - self._n_policy_updates[self._log_it_counter].item(), - experience_to_policy_grad_ratio, - elapsed_h, - est_remaining_time_h, - self._env_step_fps[self._log_it_counter].item(), - self._env_step_rt_factor[self._log_it_counter].item(), - self._collection_dt[self._log_it_counter].item(), - self._policy_update_fps[self._log_it_counter].item(), - self._policy_update_dt[self._log_it_counter].item(), - is_done, - self._n_of_played_episodes[self._log_it_counter].item(), - self._batch_norm_update_dt[self._log_it_counter].item(), - ] - self._shared_algo_data.write(dyn_info_name=info_names, - val=info_data) - - # write debug info to remote wandb server - db_data_names = list(self._env.custom_db_data.keys()) - for dbdatan in db_data_names: - data = self._custom_env_data[dbdatan] - data_names = self._env.custom_db_data[dbdatan].data_names() - - self._custom_env_data_db_dict.update({f"env_dbdata/{dbdatan}" + "_max": - wandb.Histogram(data["max"][self._log_it_counter, :, :].numpy())}) - self._custom_env_data_db_dict.update({f"env_dbdata/{dbdatan}" + "_avrg": - wandb.Histogram(data["avrg"][self._log_it_counter, :, :].numpy())}) - self._custom_env_data_db_dict.update({f"env_dbdata/{dbdatan}" + "_min": - wandb.Histogram(data["min"][self._log_it_counter, :, :].numpy())}) - - self._custom_env_data_db_dict.update({f"env_dbdata/{dbdatan}-{data_names[i]}" + "_max_over_envs": - data["max_over_envs"][self._log_it_counter, :, i:i+1] for i in range(len(data_names))}) - self._custom_env_data_db_dict.update({f"env_dbdata/{dbdatan}-{data_names[i]}" + "_min_over_envs": - data["min_over_envs"][self._log_it_counter, :, i:i+1] for i in range(len(data_names))}) - self._custom_env_data_db_dict.update({f"env_dbdata/{dbdatan}-{data_names[i]}" + "_avrg_over_envs": - data["avrg_over_envs"][self._log_it_counter, :, i:i+1] for i in range(len(data_names))}) - self._custom_env_data_db_dict.update({f"env_dbdata/{dbdatan}-{data_names[i]}" + "_std_over_envs": - data["std_over_envs"][self._log_it_counter, :, i:i+1] for i in range(len(data_names))}) - - self._wandb_d.update({'log_iteration' : self._log_it_counter}) - self._wandb_d.update(dict(zip(info_names, info_data))) - - # debug environments - 
self._wandb_d.update({'correlation_db/ep_timesteps_env_distr': - wandb.Histogram(self._ep_tsteps_env_distribution[self._log_it_counter, :, :].numpy())}) - - self._wandb_d.update({'tot_reward/tot_rew_max': wandb.Histogram(self._tot_rew_max[self._log_it_counter, :, :].numpy()), - 'tot_reward/tot_rew_avrg': wandb.Histogram(self._tot_rew_avrg[self._log_it_counter, :, :].numpy()), - 'tot_reward/tot_rew_min': wandb.Histogram(self._tot_rew_min[self._log_it_counter, :, :].numpy()), - 'tot_reward/tot_rew_max_over_envs': self._tot_rew_max_over_envs[self._log_it_counter, :, :].item(), - 'tot_reward/tot_rew_min_over_envs': self._tot_rew_min_over_envs[self._log_it_counter, :, :].item(), - 'tot_reward/tot_rew_avrg_over_envs': self._tot_rew_avrg_over_envs[self._log_it_counter, :, :].item(), - 'tot_reward/tot_rew_std_over_envs': self._tot_rew_std_over_envs[self._log_it_counter, :, :].item(), - }) - # sub rewards from db envs - self._wandb_d.update({f"sub_reward/{self._reward_names[i]}_sub_rew_max": - wandb.Histogram(self._sub_rew_max.numpy()[self._log_it_counter, :, i:i+1]) for i in range(len(self._reward_names))}) - self._wandb_d.update({f"sub_reward/{self._reward_names[i]}_sub_rew_min": - wandb.Histogram(self._sub_rew_min.numpy()[self._log_it_counter, :, i:i+1]) for i in range(len(self._reward_names))}) - self._wandb_d.update({f"sub_reward/{self._reward_names[i]}_sub_rew_avrg": - wandb.Histogram(self._sub_rew_avrg.numpy()[self._log_it_counter, :, i:i+1]) for i in range(len(self._reward_names))}) - - - self._wandb_d.update({f"sub_reward/{self._reward_names[i]}_sub_rew_max_over_envs": - self._sub_rew_max_over_envs[self._log_it_counter, :, i:i+1] for i in range(len(self._reward_names))}) - self._wandb_d.update({f"sub_reward/{self._reward_names[i]}_sub_rew_min_over_envs": - self._sub_rew_min_over_envs[self._log_it_counter, :, i:i+1] for i in range(len(self._reward_names))}) - self._wandb_d.update({f"sub_reward/{self._reward_names[i]}_sub_rew_avrg_over_envs": - self._sub_rew_avrg_over_envs[self._log_it_counter, :, i:i+1] for i in range(len(self._reward_names))}) - self._wandb_d.update({f"sub_reward/{self._reward_names[i]}_sub_rew_std_over_envs": - self._sub_rew_std_over_envs[self._log_it_counter, :, i:i+1] for i in range(len(self._reward_names))}) - - # exploration envs - if self._n_expl_envs > 0: - self._wandb_d.update({'correlation_db/ep_timesteps_expl_env_distr': - wandb.Histogram(self._ep_tsteps_expl_env_distribution[self._log_it_counter, :, :].numpy())}) - - # sub reward from expl envs - self._wandb_d.update({f"sub_reward_expl/{self._reward_names[i]}_sub_rew_max_expl": - wandb.Histogram(self._sub_rew_max_expl.numpy()[self._log_it_counter, :, i:i+1]) for i in range(len(self._reward_names))}) - self._wandb_d.update({f"sub_reward_expl/{self._reward_names[i]}_sub_rew_avrg_expl": - wandb.Histogram(self._sub_rew_avrg_expl.numpy()[self._log_it_counter, :, i:i+1]) for i in range(len(self._reward_names))}) - self._wandb_d.update({f"sub_reward_expl/{self._reward_names[i]}_sub_rew_min_expl": - wandb.Histogram(self._sub_rew_min_expl.numpy()[self._log_it_counter, :, i:i+1]) for i in range(len(self._reward_names))}) - - self._wandb_d.update({f"sub_reward_expl/{self._reward_names[i]}_sub_rew_max_over_envs_expl": - self._sub_rew_max_over_envs_expl[self._log_it_counter, :, i:i+1] for i in range(len(self._reward_names))}) - self._wandb_d.update({f"sub_reward_expl/{self._reward_names[i]}_sub_rew_min_over_envs_expl": - self._sub_rew_min_over_envs_expl[self._log_it_counter, :, i:i+1] for i in 
range(len(self._reward_names))}) - self._wandb_d.update({f"sub_reward_expl/{self._reward_names[i]}_sub_rew_avrg_over_envs_expl": - self._sub_rew_avrg_over_envs_expl[self._log_it_counter, :, i:i+1] for i in range(len(self._reward_names))}) - self._wandb_d.update({f"sub_reward_expl/{self._reward_names[i]}_sub_rew_std_over_envs_expl": - self._sub_rew_std_over_envs_expl[self._log_it_counter, :, i:i+1] for i in range(len(self._reward_names))}) - - # demo envs (only log if active, otherwise we log nans which wandb doesn't like) - if self._env.n_demo_envs() > 0: - if self._env.demo_active(): - # log histograms only if there are no nans in the data - self._wandb_d.update({'correlation_db/ep_timesteps_demo_env_distr': - wandb.Histogram(self._ep_tsteps_demo_env_distribution[self._log_it_counter, :, :].numpy())}) - - # sub rewards from demo envs - self._wandb_d.update({f"sub_reward_demo/{self._reward_names[i]}_sub_rew_max_demo": - wandb.Histogram(self._sub_rew_max_demo.numpy()[self._log_it_counter, :, i:i+1]) for i in range(len(self._reward_names))}) - self._wandb_d.update({f"sub_reward_demo/{self._reward_names[i]}_sub_rew_avrg_demo": - wandb.Histogram(self._sub_rew_avrg_demo.numpy()[self._log_it_counter, :, i:i+1]) for i in range(len(self._reward_names))}) - self._wandb_d.update({f"sub_reward_demo/{self._reward_names[i]}_sub_rew_min_demo": - wandb.Histogram(self._sub_rew_min_demo.numpy()[self._log_it_counter, :, i:i+1]) for i in range(len(self._reward_names))}) - - self._wandb_d.update({f"sub_reward_demo/{self._reward_names[i]}_sub_rew_max_over_envs_demo": - self._sub_rew_max_over_envs_demo[self._log_it_counter, :, i:i+1] for i in range(len(self._reward_names))}) - self._wandb_d.update({f"sub_reward_demo/{self._reward_names[i]}_sub_rew_min_over_envs_demo": - self._sub_rew_min_over_envs_demo[self._log_it_counter, :, i:i+1] for i in range(len(self._reward_names))}) - self._wandb_d.update({f"sub_reward_demo/{self._reward_names[i]}_sub_rew_avrg_over_envs_demo": - self._sub_rew_avrg_over_envs_demo[self._log_it_counter, :, i:i+1] for i in range(len(self._reward_names))}) - self._wandb_d.update({f"sub_reward_demo/{self._reward_names[i]}_sub_rew_std_over_envs_demo": - self._sub_rew_std_over_envs_demo[self._log_it_counter, :, i:i+1] for i in range(len(self._reward_names))}) - - if self._vec_transition_counter > (self._warmstart_vectimesteps-1): - # algo info - self._policy_update_db_data_dict.update({ - "sac_q_info/qf1_vals_mean": self._qf1_vals_mean[self._log_it_counter, 0], - "sac_q_info/qf2_vals_mean": self._qf2_vals_mean[self._log_it_counter, 0], - "sac_q_info/min_qft_vals_mean": self._min_qft_vals_mean[self._log_it_counter, 0], - "sac_q_info/qf1_vals_std": self._qf1_vals_std[self._log_it_counter, 0], - "sac_q_info/qf2_vals_std": self._qf2_vals_std[self._log_it_counter, 0], - "sac_q_info/min_qft_vals_std": self._min_qft_vals_std[self._log_it_counter, 0], - "sac_q_info/qf1_vals_max": self._qf1_vals_max[self._log_it_counter, 0], - "sac_q_info/qf2_vals_max": self._qf2_vals_max[self._log_it_counter, 0], - "sac_q_info/qf1_vals_min": self._qf1_vals_min[self._log_it_counter, 0], - "sac_q_info/qf2_vals_min": self._qf2_vals_min[self._log_it_counter, 0], - - "sac_actor_info/policy_entropy_mean": self._policy_entropy_mean[self._log_it_counter, 0], - "sac_actor_info/policy_entropy_std": self._policy_entropy_std[self._log_it_counter, 0], - "sac_actor_info/policy_entropy_max": self._policy_entropy_max[self._log_it_counter, 0], - "sac_actor_info/policy_entropy_min": self._policy_entropy_min[self._log_it_counter, 0], - 
"sac_actor_info/policy_entropy_disc_mean": self._policy_entropy_disc_mean[self._log_it_counter, 0], - "sac_actor_info/policy_entropy_disc_std": self._policy_entropy_disc_std[self._log_it_counter, 0], - "sac_actor_info/policy_entropy_disc_max": self._policy_entropy_disc_max[self._log_it_counter, 0], - "sac_actor_info/policy_entropy_disc_min": self._policy_entropy_disc_min[self._log_it_counter, 0], - "sac_actor_info/policy_entropy_cont_mean": self._policy_entropy_cont_mean[self._log_it_counter, 0], - "sac_actor_info/policy_entropy_cont_std": self._policy_entropy_cont_std[self._log_it_counter, 0], - "sac_actor_info/policy_entropy_cont_max": self._policy_entropy_cont_max[self._log_it_counter, 0], - "sac_actor_info/policy_entropy_cont_min": self._policy_entropy_cont_min[self._log_it_counter, 0], - - "sac_q_info/qf1_loss": self._qf1_loss[self._log_it_counter, 0], - "sac_q_info/qf2_loss": self._qf2_loss[self._log_it_counter, 0], - "sac_actor_info/actor_loss": self._actor_loss[self._log_it_counter, 0]}) - alpha_logs = { - "sac_alpha_info/alpha": self._alphas[self._log_it_counter, 0], - "sac_alpha_info/alpha_disc": self._alphas_disc[self._log_it_counter, 0], - "sac_alpha_info/alpha_cont": self._alphas_cont[self._log_it_counter, 0], - "sac_alpha_info/target_entropy": self._target_entropy, - "sac_alpha_info/target_entropy_disc": self._target_entropy_disc, - "sac_alpha_info/target_entropy_cont": self._target_entropy_cont - } - if self._autotune: - alpha_logs.update({ - "sac_alpha_info/alpha_loss": self._alpha_loss[self._log_it_counter, 0], - "sac_alpha_info/alpha_loss_disc": self._alpha_loss_disc[self._log_it_counter, 0], - "sac_alpha_info/alpha_loss_cont": self._alpha_loss_cont[self._log_it_counter, 0], - }) - self._policy_update_db_data_dict.update(alpha_logs) - - if self._validate: - self._policy_update_db_data_dict.update({ - "sac_q_info/qf1_loss_validation": self._qf1_loss_validation[self._log_it_counter, 0], - "sac_q_info/qf2_loss_validation": self._qf2_loss_validation[self._log_it_counter, 0], - "sac_q_info/overfit_index": self._overfit_index[self._log_it_counter, 0], - "sac_actor_info/actor_loss_validation": self._actor_loss_validation[self._log_it_counter, 0]}) - if self._autotune: - self._policy_update_db_data_dict.update({ - "sac_alpha_info/alpha_loss_validation": self._alpha_loss_validation[self._log_it_counter, 0], - "sac_alpha_info/alpha_loss_disc_validation": self._alpha_loss_disc_validation[self._log_it_counter, 0], - "sac_alpha_info/alpha_loss_cont_validation": self._alpha_loss_cont_validation[self._log_it_counter, 0]}) - - self._wandb_d.update(self._policy_update_db_data_dict) - - if self._use_rnd: - self._rnd_db_data_dict.update({ - "rnd_info/expl_bonus_raw_avrg": self._expl_bonus_raw_avrg[self._log_it_counter, 0], - "rnd_info/expl_bonus_raw_std": self._expl_bonus_raw_std[self._log_it_counter, 0], - "rnd_info/expl_bonus_proc_avrg": self._expl_bonus_proc_avrg[self._log_it_counter, 0], - "rnd_info/expl_bonus_proc_std": self._expl_bonus_proc_std[self._log_it_counter, 0], - "rnd_info/rnd_loss": self._rnd_loss[self._log_it_counter, 0], - }) - self._wandb_d.update(self._rnd_db_data_dict) - - if self._rnd_net.obs_running_norm is not None: - # adding info on running normalizer if used - if self._running_mean_rnd_input is not None: - self._wandb_d.update({f"rnd_info/running_mean_rhc_input": self._running_mean_rnd_input[self._log_it_counter, :]}) - if self._running_std_rnd_input is not None: - self._wandb_d.update({f"rnd_info/running_std_rhc_input": 
self._running_std_rnd_input[self._log_it_counter, :]}) - - if self._agent.obs_running_norm is not None: - # adding info on running normalizer if used - if self._running_mean_obs is not None: - self._wandb_d.update({f"running_norm/mean": self._running_mean_obs[self._log_it_counter, :]}) - if self._running_std_obs is not None: - self._wandb_d.update({f"running_norm/std": self._running_std_obs[self._log_it_counter, :]}) - - self._wandb_d.update(self._custom_env_data_db_dict) - - wandb.log(self._wandb_d) - - if self._verbose: - - info =f"\nTotal n. timesteps simulated: {self._n_timesteps_done[self._log_it_counter].item()}/{self._total_timesteps}\n" + \ - f"N. policy updates performed: {self._n_policy_updates[self._log_it_counter].item()}/{self._n_policy_updates_to_be_done}\n" + \ - f"N. q fun updates performed: {self._n_qfun_updates[self._log_it_counter].item()}/{self._n_qf_updates_to_be_done}\n" + \ - f"N. trgt q fun updates performed: {self._n_tqfun_updates[self._log_it_counter].item()}/{self._n_tqf_updates_to_be_done}\n" + \ - f"experience to policy grad ratio: {experience_to_policy_grad_ratio}\n" + \ - f"experience to q fun grad ratio: {experience_to_qfun_grad_ratio}\n" + \ - f"experience to trgt q fun grad ratio: {experience_to_tqfun_grad_ratio}\n"+ \ - f"Warmstart completed: {self._vec_transition_counter > self._warmstart_vectimesteps or self._eval} ; ({self._vec_transition_counter}/{self._warmstart_vectimesteps})\n" +\ - f"Replay buffer full: {self._replay_bf_full}; current position {self._bpos}/{self._replay_buffer_size_vec}\n" +\ - f"Validation buffer full: {self._validation_bf_full}; current position {self._bpos_val}/{self._validation_buffer_size_vec}\n" +\ - f"Elapsed time: {self._elapsed_min[self._log_it_counter].item()/60.0} h\n" + \ - f"Estimated remaining training time: " + \ - f"{est_remaining_time_h} h\n" + \ - f"Total reward episodic data --> \n" + \ - f"max: {self._tot_rew_max_over_envs[self._log_it_counter, :, :].item()}\n" + \ - f"avg: {self._tot_rew_avrg_over_envs[self._log_it_counter, :, :].item()}\n" + \ - f"min: {self._tot_rew_min_over_envs[self._log_it_counter, :, :].item()}\n" + \ - f"Episodic sub-rewards episodic data --> \nsub rewards names: {self._reward_names_str}\n" + \ - f"max: {self._sub_rew_max_over_envs[self._log_it_counter, :]}\n" + \ - f"min: {self._sub_rew_min_over_envs[self._log_it_counter, :]}\n" + \ - f"avg: {self._sub_rew_avrg_over_envs[self._log_it_counter, :]}\n" + \ - f"std: {self._sub_rew_std_over_envs[self._log_it_counter, :]}\n" + \ - f"N. of episodes on which episodic rew stats are computed: {self._n_of_played_episodes[self._log_it_counter].item()}\n" + \ - f"Current env. step sps: {self._env_step_fps[self._log_it_counter].item()}, time for experience collection {self._collection_dt[self._log_it_counter].item()} s\n" + \ - f"Current env (sub-stepping) rt factor: {self._env_step_rt_factor[self._log_it_counter].item()}\n" + \ - f"Current policy update fps: {self._policy_update_fps[self._log_it_counter].item()}, time for policy updates {self._policy_update_dt[self._log_it_counter].item()} s\n" + \ - f"Time spent updating batch normalizations {self._batch_norm_update_dt[self._log_it_counter].item()} s\n" + \ - f"Time spent for computing validation {self._validation_dt[self._log_it_counter].item()} s\n" + \ - f"Demo envs are active: {self._demo_envs_active[self._log_it_counter].item()}. 
N demo envs if active {self._env.n_demo_envs()}\n" + \ - f"Performance metric now: {self._demo_perf_metric[self._log_it_counter].item()}\n" + \ - f"Entropy (disc): current {float(self._policy_entropy_disc_mean[self._log_it_counter, 0]):.4f}/{self._target_entropy_disc:.4f}\n" + \ - f"Entropy (cont): current {float(self._policy_entropy_cont_mean[self._log_it_counter, 0]):.4f}/{self._target_entropy_cont:.4f}\n" - if self._use_rnd: - info = info + f"N. rnd updates performed: {self._n_rnd_updates[self._log_it_counter].item()}\n" - - Journal.log(self.__class__.__name__, - "_post_step", - info, - LogType.INFO, - throw_when_excep = True) - - def _add_experience(self, - obs: torch.Tensor, actions: torch.Tensor, rewards: torch.Tensor, - next_obs: torch.Tensor, - next_terminal: torch.Tensor) -> None: - - if self._validate and \ - (self._vec_transition_counter % self._validation_collection_vecfreq == 0): - # fill validation buffer - - self._obs_val[self._bpos_val] = obs - self._next_obs_val[self._bpos_val] = next_obs - self._actions_val[self._bpos_val] = actions - self._rewards_val[self._bpos_val] = rewards - self._next_terminal_val[self._bpos_val] = next_terminal - - self._bpos_val += 1 - if self._bpos_val == self._validation_buffer_size_vec: - self._validation_bf_full = True - self._bpos_val = 0 - - else: # fill normal replay buffer - self._obs[self._bpos] = obs - self._next_obs[self._bpos] = next_obs - self._actions[self._bpos] = actions - self._rewards[self._bpos] = rewards - self._next_terminal[self._bpos] = next_terminal - - self._bpos += 1 - if self._bpos == self._replay_buffer_size_vec: - self._replay_bf_full = True - self._bpos = 0 - - def _sample(self, size: int = None): - - if size is None: - size=self._batch_size - - batched_obs = self._obs.view((-1, self._env.obs_dim())) - batched_next_obs = self._next_obs.view((-1, self._env.obs_dim())) - batched_actions = self._actions.view((-1, self._env.actions_dim())) - batched_rewards = self._rewards.view(-1) - batched_terminal = self._next_terminal.view(-1) - - # sampling from the batched buffer - up_to = self._replay_buffer_size if self._replay_bf_full else self._bpos*self._num_envs - shuffled_buffer_idxs = torch.randint(0, up_to, - (size,)) - - sampled_obs = batched_obs[shuffled_buffer_idxs] - sampled_next_obs = batched_next_obs[shuffled_buffer_idxs] - sampled_actions = batched_actions[shuffled_buffer_idxs] - sampled_rewards = batched_rewards[shuffled_buffer_idxs] - sampled_terminal = batched_terminal[shuffled_buffer_idxs] - - return sampled_obs, sampled_actions,\ - sampled_next_obs,\ - sampled_rewards, \ - sampled_terminal - - def _sample_validation(self, size: int = None): - - if size is None: - size=self._validation_batch_size - - batched_obs = self._obs_val.view((-1, self._env.obs_dim())) - batched_next_obs = self._next_obs_val.view((-1, self._env.obs_dim())) - batched_actions = self._actions_val.view((-1, self._env.actions_dim())) - batched_rewards = self._rewards_val.view(-1) - batched_terminal = self._next_terminal_val.view(-1) - - # sampling from the batched buffer - up_to = self._validation_buffer_size if self._validation_bf_full else self._bpos_val*self._num_envs - shuffled_buffer_idxs = torch.randint(0, up_to, - (size,)) - - sampled_obs = batched_obs[shuffled_buffer_idxs] - sampled_next_obs = batched_next_obs[shuffled_buffer_idxs] - sampled_actions = batched_actions[shuffled_buffer_idxs] - sampled_rewards = batched_rewards[shuffled_buffer_idxs] - sampled_terminal = batched_terminal[shuffled_buffer_idxs] - - return sampled_obs, 
sampled_actions,\ - sampled_next_obs,\ - sampled_rewards, \ - sampled_terminal - - def _sample_random_actions(self): - - self._random_uniform.uniform_(-1,1) - random_actions = self._random_uniform - - return random_actions - - def _perturb_some_actions(self, - actions: torch.Tensor): - - if self._is_continuous_actions_bool.any(): # if there are any continuous actions - self._perturb_actions(actions, - action_idxs=self._is_continuous_actions, - env_idxs=self._expl_env_selector.to(actions.device), - normal=True, # use normal for continuous - scaling=self._continuous_act_expl_noise_std) - if self._is_discrete_actions_bool.any(): # actions to be treated as discrete - self._perturb_actions(actions, - action_idxs=self._is_discrete_actions, - env_idxs=self._expl_env_selector.to(actions.device), - normal=False, # use uniform distr for discrete - scaling=self._discrete_act_expl_noise_std) - self._pert_counter+=1 - if self._pert_counter >= self._noise_duration_vec: - self._pert_counter=0 - - def _perturb_actions(self, - actions: torch.Tensor, - action_idxs: torch.Tensor, - env_idxs: torch.Tensor, - normal: bool = True, - scaling: float = 1.0): - if normal: # gaussian - # not super efficient (in theory random_normal can be made smaller in size) - self._random_normal.normal_(mean=0, std=1) - noise=self._random_normal - else: # uniform - self._random_uniform.uniform_(-1,1) - noise=self._random_uniform - - env_indices = env_idxs.reshape(-1,1) # column vector of env indices (for broadcasting) - action_indices = action_idxs.reshape(1,-1) # row vector of action indices - - perturbation=noise[env_indices, action_indices]*scaling - perturbed_actions=actions[env_indices, action_indices]+perturbation - perturbed_actions.clamp_(-1.0, 1.0) # enforce normalized bounds - - actions[env_indices, action_indices]=\ - perturbed_actions - - def _update_batch_norm(self, bsize: int = None): - - if bsize is None: - bsize=self._batch_size # same used for training - - up_to = self._replay_buffer_size if self._replay_bf_full else self._bpos*self._num_envs - shuffled_buffer_idxs = torch.randint(0, up_to, - (bsize,)) - - # update obs normalization - # (we should also sample next obs, but if most transitions are non-terminal - # this is not an issue and sampling obs only is more efficient) - if (self._agent.obs_running_norm is not None) and \ - (not self._eval): - batched_obs = self._obs.view((-1, self._obs_dim)) - sampled_obs = batched_obs[shuffled_buffer_idxs] - self._agent.update_obs_bnorm(x=sampled_obs) - - if self._use_rnd: # update running norm for RND - batched_obs = self._obs.view((-1, self._obs_dim)) - batched_actions = self._actions.view((-1, self._actions_dim)) - sampled_obs = batched_obs[shuffled_buffer_idxs] - sampled_actions = batched_actions[shuffled_buffer_idxs] - torch.cat(tensors=(sampled_obs, sampled_actions), dim=1, - out=self._rnd_bnorm_input) - self._rnd_net.update_input_bnorm(x=self._rnd_bnorm_input) - - # update running norm on rewards also - # if self._reward_normalizer is not None: - # batched_rew = self._rewards.view(-1) - # sampled_rew = batched_rew[shuffled_buffer_idxs] - # self._reward_normalizer.manual_stat_update(x=sampled_rew) - - def _reset_agent(self): - # not super efficient, but effective -> - # brand new agent, brand new optimizers - self._agent.reset() - # forcing deallocation of previous optimizers - import gc - del self._qf_optimizer - del self._actor_optimizer - if self._autotune: - del self._a_optimizer_disc - del self._a_optimizer_cont - del self._log_alpha_disc - 
del self._log_alpha_cont - gc.collect() - self._init_agent_optimizers() - if self._autotune: # also reinitialize alpha optimization - self._init_alpha_autotuning() - - self._overfit_idx=0.0 - - def _switch_training_mode(self, - train: bool = True): - - self._agent.train(train) - - def _init_algo_shared_data(self, - static_params: Dict): - - self._shared_algo_data = SharedRLAlgorithmInfo(namespace=self._ns, - is_server=True, - static_params=static_params, - verbose=self._verbose, - vlevel=VLevel.V2, - safe=False, - force_reconnection=True) - - self._shared_algo_data.run() - - # write some initializations - self._shared_algo_data.write(dyn_info_name=["is_done"], - val=[0.0]) - - # only written to if flags where enabled - self._qf_vals=QfVal(namespace=self._ns, - is_server=True, - n_envs=self._num_envs, - verbose=self._verbose, - vlevel=VLevel.V2, - safe=False, - force_reconnection=True) - self._qf_vals.run() - self._qf_trgt=QfTrgt(namespace=self._ns, - is_server=True, - n_envs=self._num_envs, - verbose=self._verbose, - vlevel=VLevel.V2, - safe=False, - force_reconnection=True) - self._qf_trgt.run() - - # episodic returns - reward_names=self._episodic_reward_metrics.data_names() - self._sub_returns=SubReturns(namespace=self._ns, - is_server=True, - n_envs=self._num_envs, - n_rewards=len(reward_names), - reward_names=reward_names, - verbose=self._verbose, - vlevel=VLevel.V2, - safe=False, - force_reconnection=True) - self._sub_returns.run() - - self._tot_returns=TotReturns(namespace=self._ns, - is_server=True, - n_envs=self._num_envs, - verbose=self._verbose, - vlevel=VLevel.V2, - safe=False, - force_reconnection=True) - self._tot_returns.run()
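- # ----------------------------------------------------------------------------------
- # Illustrative helper, not used by the class above: a minimal sketch of how an env db
- # checkpoint written by _dump_env_checkpoints() can be read back. The function name and
- # the example path are hypothetical; dataset/attribute names are the ones created above.
- def read_env_db_checkpoint(path: str):
-     import h5py # local import, mirroring the module's h5py usage
-     with h5py.File(path, 'r') as hf:
-         reward_names = list(hf.attrs['sub_reward_names'])
-         episodes = []
-         for ep_idx in range(int(hf.attrs['ep_vec_freq'])):
-             # one reward dataset per episode, as written in _dump_env_checkpoints()
-             episodes.append((hf[f'ep_{ep_idx}_sub_rew'][()],
-                 hf[f'ep_{ep_idx}_tot_rew'][()]))
-         return reward_names, episodes
- # usage (hypothetical path): names, eps = read_env_db_checkpoint("run_id_env_db_checkpoint0.hdf5")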
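- # ----------------------------------------------------------------------------------
- # Self-contained sketch (all names here are illustrative, not from the class) of the
- # vectorized replay-buffer layout used by _add_experience()/_sample(): each buffer row
- # stores one transition per env; sampling views the buffer as (size_vec*n_envs, dim)
- # and draws uniform indices only over the portion filled so far.
- def flat_buffer_sampling_demo():
-     import torch
-     buf_size_vec, n_envs, obs_dim, batch_size = 8, 4, 3, 16
-     obs_buf = torch.zeros((buf_size_vec, n_envs, obs_dim))
-     bpos, buf_full = 0, False
-     for _ in range(5): # add 5 vectorized transitions, as _add_experience() does
-         obs_buf[bpos] = torch.randn(n_envs, obs_dim)
-         bpos += 1
-         if bpos == buf_size_vec: # wrap around: oldest data gets overwritten next
-             buf_full = True
-             bpos = 0
-     up_to = buf_size_vec*n_envs if buf_full else bpos*n_envs
-     idxs = torch.randint(0, up_to, (batch_size,))
-     return obs_buf.view(-1, obs_dim)[idxs] # -> (batch_size, obs_dim)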