Spaces:
Sleeping
Sleeping
from typing import Union, Optional, List, Any, Callable, Tuple

import torch

from ding.config import compile_config, read_config
from ding.envs import get_vec_env_setting
from ding.policy import create_policy
from ding.torch_utils import to_tensor, to_ndarray, tensor_to_list
from ding.utils import set_pkg_seed
def eval(
        input_cfg: Union[str, Tuple[dict, dict]],
        seed: int = 0,
        model: Optional[torch.nn.Module] = None,
        state_dict: Optional[dict] = None,
        replay_path: Optional[str] = './video',
) -> float:
    r"""
    Overview:
        The evaluation entry for NGU policy: run one episode in a single \
        evaluator env with replay saving enabled, and return the episode return.
    Arguments:
        - input_cfg (:obj:`Union[str, Tuple[dict, dict]]`): Config in dict type. \
            ``str`` type means config file path. \
            ``Tuple[dict, dict]`` type means [user_config, create_cfg].
        - seed (:obj:`int`): Random seed.
        - model (:obj:`Optional[torch.nn.Module]`): Instance of torch.nn.Module.
        - state_dict (:obj:`Optional[dict]`): The state_dict of policy or model. \
            If ``None``, it is loaded from ``cfg.learner.load_path``.
        - replay_path (:obj:`Optional[str]`): Directory in which the env saves the replay video.
    Returns:
        - episode_return (:obj:`float`): The cumulative reward of the evaluated episode.
    """
    if isinstance(input_cfg, str):
        cfg, create_cfg = read_config(input_cfg)
    else:
        cfg, create_cfg = input_cfg
    create_cfg.policy.type += '_command'
    cfg = compile_config(cfg, auto=True, create_cfg=create_cfg)
    # Build a single evaluator env (index 0 of the evaluator env configs) with a
    # fixed seed so repeated runs of the same seed are reproducible.
    env_fn, _, evaluator_env_cfg = get_vec_env_setting(cfg.env)
    env = env_fn(evaluator_env_cfg[0])
    env.seed(seed, dynamic_seed=False)
    set_pkg_seed(seed, use_cuda=cfg.policy.cuda)
    policy = create_policy(cfg.policy, model=model, enable_field=['eval']).eval_mode
    if state_dict is None:
        # NOTE(review): reads ``cfg.learner.load_path`` — confirm this key exists
        # in the compiled config for this entry.
        state_dict = torch.load(cfg.learner.load_path, map_location='cpu')
    policy.load_state_dict(state_dict)
    env.enable_save_replay(replay_path=replay_path)
    obs = env.reset()
    # The policy's eval forward expects dicts keyed by env id; only env id 0 exists here.
    obs = {0: obs}
    episode_return = 0.
    # NGU takes extra recurrent inputs besides obs: the beta (exploration factor)
    # index, the previous action and the previous extrinsic reward.
    beta_index = {i: 0 for i in range(1)}
    beta_index = to_tensor(beta_index, dtype=torch.int64)
    prev_action = {i: torch.tensor(-1) for i in range(1)}
    prev_reward_e = {i: to_tensor(0, dtype=torch.float32) for i in range(1)}
    while True:
        # TODO(pu): r_i, reward embedding
        policy_output = policy.forward(beta_index, obs, prev_action, prev_reward_e)
        actions = {i: a['action'] for i, a in policy_output.items()}
        actions = to_ndarray(actions)
        action = policy_output[0]['action']
        action = to_ndarray(action)
        timestep = env.step(action)
        timesteps = {0: timestep}
        timesteps = to_tensor(timesteps, dtype=torch.float32)
        # Feed this step's extrinsic reward and action back in as the next
        # step's ``prev_*`` inputs.
        prev_reward_e = {env_id: timestep.reward for env_id, timestep in timesteps.items()}
        prev_reward_e = to_ndarray(prev_reward_e)
        prev_action = actions
        timestep = timesteps[0]
        episode_return += timestep.reward
        obs = timestep.obs
        obs = {0: obs}
        if timestep.done:
            print(timestep.info)
            break
    print('Eval is over! The performance of your RL policy is {}'.format(episode_return))
    # Bug fix: the signature promises ``-> float`` but the original returned None.
    return episode_return
| if __name__ == "__main__": | |
| # Users should add their own model path here. Model path should lead to a model. | |
| # Absolute path is recommended. | |
| # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. | |
| model_path = './debug_minigrid_doorkey_ngu_ul298_er01_n32_rbs3e4_fixepseval/ckpt/ckpt_best.pth.tar', | |
| # model_path = 'model_path_placeholder', | |
| cfg = '../config/minigrid_ngu_config.py' | |
| state_dict = torch.load(model_path, map_location='cpu') | |
| for i in range(0, 10): | |
| eval(cfg, seed=i, state_dict=state_dict, replay_path='./video') | |