| | import os |
| | import time |
| | from collections import deque |
| | import pickle |
| |
|
| | from baselines.ddpg.ddpg_learner import DDPG |
| | from baselines.ddpg.models import Actor, Critic |
| | from baselines.ddpg.memory import Memory |
| | from baselines.ddpg.noise import AdaptiveParamNoiseSpec, NormalActionNoise, OrnsteinUhlenbeckActionNoise |
| | from baselines.common import set_global_seeds |
| | import baselines.common.tf_util as U |
| |
|
| | from baselines import logger |
| | import numpy as np |
| |
|
| | try: |
| | from mpi4py import MPI |
| | except ImportError: |
| | MPI = None |
| |
|
| | def learn(network, env, |
| | seed=None, |
| | total_timesteps=None, |
| | nb_epochs=None, |
| | nb_epoch_cycles=20, |
| | nb_rollout_steps=100, |
| | reward_scale=1.0, |
| | render=False, |
| | render_eval=False, |
| | noise_type='adaptive-param_0.2', |
| | normalize_returns=False, |
| | normalize_observations=True, |
| | critic_l2_reg=1e-2, |
| | actor_lr=1e-4, |
| | critic_lr=1e-3, |
| | popart=False, |
| | gamma=0.99, |
| | clip_norm=None, |
| | nb_train_steps=50, |
| | nb_eval_steps=100, |
| | batch_size=64, |
| | tau=0.01, |
| | eval_env=None, |
| | param_noise_adaption_interval=50, |
| | **network_kwargs): |
| |
|
| | set_global_seeds(seed) |
| |
|
| | if total_timesteps is not None: |
| | assert nb_epochs is None |
| | nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps) |
| | else: |
| | nb_epochs = 500 |
| |
|
| | if MPI is not None: |
| | rank = MPI.COMM_WORLD.Get_rank() |
| | else: |
| | rank = 0 |
| |
|
| | nb_actions = env.action_space.shape[-1] |
| | assert (np.abs(env.action_space.low) == env.action_space.high).all() |
| |
|
| | memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape) |
| | critic = Critic(network=network, **network_kwargs) |
| | actor = Actor(nb_actions, network=network, **network_kwargs) |
| |
|
| | action_noise = None |
| | param_noise = None |
| | if noise_type is not None: |
| | for current_noise_type in noise_type.split(','): |
| | current_noise_type = current_noise_type.strip() |
| | if current_noise_type == 'none': |
| | pass |
| | elif 'adaptive-param' in current_noise_type: |
| | _, stddev = current_noise_type.split('_') |
| | param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev)) |
| | elif 'normal' in current_noise_type: |
| | _, stddev = current_noise_type.split('_') |
| | action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) |
| | elif 'ou' in current_noise_type: |
| | _, stddev = current_noise_type.split('_') |
| | action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) |
| | else: |
| | raise RuntimeError('unknown noise type "{}"'.format(current_noise_type)) |
| |
|
| | max_action = env.action_space.high |
| | logger.info('scaling actions by {} before executing in env'.format(max_action)) |
| |
|
| | agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, |
| | gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, |
| | batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, |
| | actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, |
| | reward_scale=reward_scale) |
| | logger.info('Using agent with the following configuration:') |
| | logger.info(str(agent.__dict__.items())) |
| |
|
| | eval_episode_rewards_history = deque(maxlen=100) |
| | episode_rewards_history = deque(maxlen=100) |
| | sess = U.get_session() |
| | |
| | agent.initialize(sess) |
| | sess.graph.finalize() |
| |
|
| | agent.reset() |
| |
|
| | obs = env.reset() |
| | if eval_env is not None: |
| | eval_obs = eval_env.reset() |
| | nenvs = obs.shape[0] |
| |
|
| | episode_reward = np.zeros(nenvs, dtype = np.float32) |
| | episode_step = np.zeros(nenvs, dtype = int) |
| | episodes = 0 |
| | t = 0 |
| |
|
| | epoch = 0 |
| |
|
| |
|
| |
|
| | start_time = time.time() |
| |
|
| | epoch_episode_rewards = [] |
| | epoch_episode_steps = [] |
| | epoch_actions = [] |
| | epoch_qs = [] |
| | epoch_episodes = 0 |
| | for epoch in range(nb_epochs): |
| | for cycle in range(nb_epoch_cycles): |
| | |
| | if nenvs > 1: |
| | |
| | |
| | agent.reset() |
| | for t_rollout in range(nb_rollout_steps): |
| | |
| | action, q, _, _ = agent.step(obs, apply_noise=True, compute_Q=True) |
| |
|
| | |
| | if rank == 0 and render: |
| | env.render() |
| |
|
| | |
| | new_obs, r, done, info = env.step(max_action * action) |
| | |
| |
|
| | t += 1 |
| | if rank == 0 and render: |
| | env.render() |
| | episode_reward += r |
| | episode_step += 1 |
| |
|
| | |
| | epoch_actions.append(action) |
| | epoch_qs.append(q) |
| | agent.store_transition(obs, action, r, new_obs, done) |
| |
|
| | obs = new_obs |
| |
|
| | for d in range(len(done)): |
| | if done[d]: |
| | |
| | epoch_episode_rewards.append(episode_reward[d]) |
| | episode_rewards_history.append(episode_reward[d]) |
| | epoch_episode_steps.append(episode_step[d]) |
| | episode_reward[d] = 0. |
| | episode_step[d] = 0 |
| | epoch_episodes += 1 |
| | episodes += 1 |
| | if nenvs == 1: |
| | agent.reset() |
| |
|
| |
|
| |
|
| | |
| | epoch_actor_losses = [] |
| | epoch_critic_losses = [] |
| | epoch_adaptive_distances = [] |
| | for t_train in range(nb_train_steps): |
| | |
| | if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: |
| | distance = agent.adapt_param_noise() |
| | epoch_adaptive_distances.append(distance) |
| |
|
| | cl, al = agent.train() |
| | epoch_critic_losses.append(cl) |
| | epoch_actor_losses.append(al) |
| | agent.update_target_net() |
| |
|
| | |
| | eval_episode_rewards = [] |
| | eval_qs = [] |
| | if eval_env is not None: |
| | nenvs_eval = eval_obs.shape[0] |
| | eval_episode_reward = np.zeros(nenvs_eval, dtype = np.float32) |
| | for t_rollout in range(nb_eval_steps): |
| | eval_action, eval_q, _, _ = agent.step(eval_obs, apply_noise=False, compute_Q=True) |
| | eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action) |
| | if render_eval: |
| | eval_env.render() |
| | eval_episode_reward += eval_r |
| |
|
| | eval_qs.append(eval_q) |
| | for d in range(len(eval_done)): |
| | if eval_done[d]: |
| | eval_episode_rewards.append(eval_episode_reward[d]) |
| | eval_episode_rewards_history.append(eval_episode_reward[d]) |
| | eval_episode_reward[d] = 0.0 |
| |
|
| | if MPI is not None: |
| | mpi_size = MPI.COMM_WORLD.Get_size() |
| | else: |
| | mpi_size = 1 |
| |
|
| | |
| | |
| | duration = time.time() - start_time |
| | stats = agent.get_stats() |
| | combined_stats = stats.copy() |
| | combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) |
| | combined_stats['rollout/return_std'] = np.std(epoch_episode_rewards) |
| | combined_stats['rollout/return_history'] = np.mean(episode_rewards_history) |
| | combined_stats['rollout/return_history_std'] = np.std(episode_rewards_history) |
| | combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) |
| | combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) |
| | combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) |
| | combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) |
| | combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) |
| | combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances) |
| | combined_stats['total/duration'] = duration |
| | combined_stats['total/steps_per_second'] = float(t) / float(duration) |
| | combined_stats['total/episodes'] = episodes |
| | combined_stats['rollout/episodes'] = epoch_episodes |
| | combined_stats['rollout/actions_std'] = np.std(epoch_actions) |
| | |
| | if eval_env is not None: |
| | combined_stats['eval/return'] = eval_episode_rewards |
| | combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history) |
| | combined_stats['eval/Q'] = eval_qs |
| | combined_stats['eval/episodes'] = len(eval_episode_rewards) |
| | def as_scalar(x): |
| | if isinstance(x, np.ndarray): |
| | assert x.size == 1 |
| | return x[0] |
| | elif np.isscalar(x): |
| | return x |
| | else: |
| | raise ValueError('expected scalar, got %s'%x) |
| |
|
| | combined_stats_sums = np.array([ np.array(x).flatten()[0] for x in combined_stats.values()]) |
| | if MPI is not None: |
| | combined_stats_sums = MPI.COMM_WORLD.allreduce(combined_stats_sums) |
| |
|
| | combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)} |
| |
|
| | |
| | combined_stats['total/epochs'] = epoch + 1 |
| | combined_stats['total/steps'] = t |
| |
|
| | for key in sorted(combined_stats.keys()): |
| | logger.record_tabular(key, combined_stats[key]) |
| |
|
| | if rank == 0: |
| | logger.dump_tabular() |
| | logger.info('') |
| | logdir = logger.get_dir() |
| | if rank == 0 and logdir: |
| | if hasattr(env, 'get_state'): |
| | with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: |
| | pickle.dump(env.get_state(), f) |
| | if eval_env and hasattr(eval_env, 'get_state'): |
| | with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: |
| | pickle.dump(eval_env.get_state(), f) |
| |
|
| |
|
| | return agent |
| |
|