import time
import functools
import tensorflow as tf

from baselines import logger

from baselines.common import set_global_seeds, explained_variance
from baselines.common import tf_util
from baselines.common.policies import build_policy

from baselines.a2c.utils import Scheduler, find_trainable_variables
from baselines.a2c.runner import Runner
from baselines.ppo2.ppo2 import safemean
from collections import deque
# Use the tf.compat.v1 losses module so that mean_squared_error returns a scalar
# loss, consistent with the tf.compat.v1 graph-mode API used throughout this file
from tensorflow.compat.v1 import losses


class Model(object):

    """
    We use this class to:
        __init__:
        - Creates the step_model
        - Creates the train_model

        train():
        - Runs one training step (feedforward and backpropagation of gradients)

        save/load():
        - Saves and loads the model
    """
    def __init__(self, policy, env, nsteps,
            ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4,
            alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'):

        sess = tf_util.get_session()
        nenvs = env.num_envs
        nbatch = nenvs*nsteps

        with tf.compat.v1.variable_scope('a2c_model', reuse=tf.compat.v1.AUTO_REUSE):
            # step_model is used for sampling: one step in each of the nenvs parallel envs
            step_model = policy(nenvs, 1, sess)

            # train_model is used to train our network on batches of nbatch = nenvs*nsteps samples
            train_model = policy(nbatch, nsteps, sess)

        A = tf.compat.v1.placeholder(train_model.action.dtype, train_model.action.shape)  # actions
        ADV = tf.compat.v1.placeholder(tf.float32, [nbatch])  # advantage estimates
        R = tf.compat.v1.placeholder(tf.float32, [nbatch])    # discounted returns
        LR = tf.compat.v1.placeholder(tf.float32, [])         # current learning rate

        # Total loss = policy gradient loss - entropy * ent_coef + value loss * vf_coef

        # Policy loss: mean over the batch of -log pi(a|s) * A(s,a)
        neglogpac = train_model.pd.neglogp(A)
        pg_loss = tf.reduce_mean(input_tensor=ADV * neglogpac)

        # Entropy bonus: discourages premature convergence to a suboptimal deterministic policy
        entropy = tf.reduce_mean(input_tensor=train_model.pd.entropy())

        # Value loss: mean squared error between predicted values and the returns
        vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R)

        loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef

        # Update parameters using the loss:
        # 1. Get the model parameters
        params = find_trainable_variables("a2c_model")

        # 2. Compute the gradients
        grads = tf.gradients(ys=loss, xs=params)
        if max_grad_norm is not None:
            # Clip the gradients so their global L2 norm does not exceed max_grad_norm
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        # Pair each gradient with its parameter, as apply_gradients expects
        grads = list(zip(grads, params))

        # 3. Build the op that performs one policy and value update step of A2C
        trainer = tf.compat.v1.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)

        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            # Advantage estimate A(s,a) = R + gamma*V(s') - V(s);
            # `rewards` already holds the bootstrapped discounted returns
            advs = rewards - values
            # Advance the learning-rate schedule by one tick per sample in the batch
            for step in range(len(obs)):
                cur_lr = lr.value()

            td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr}
            if states is not None:
                # Recurrent policies also need the hidden states and episode-done masks
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train],
                td_map
            )
            return policy_loss, value_loss, policy_entropy

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = functools.partial(tf_util.save_variables, sess=sess)
        self.load = functools.partial(tf_util.load_variables, sess=sess)
        tf.compat.v1.global_variables_initializer().run(session=sess)

def learn(
    network,
    env,
    seed=None,
    nsteps=5,
    total_timesteps=int(80e6),
    vf_coef=0.5,
    ent_coef=0.01,
    max_grad_norm=0.5,
    lr=7e-4,
    lrschedule='linear',
    epsilon=1e-5,
    alpha=0.99,
    gamma=0.99,
    log_interval=100,
    load_path=None,
    **network_kwargs):

    '''
    Main entrypoint for the A2C algorithm. Trains a policy with the given network architecture
    on a given environment using A2C. (A usage sketch appears at the bottom of this module.)

    Parameters:
    -----------

    network: policy network architecture. Either a string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
        specifying the standard network architecture, or a function that takes a tensorflow tensor as input and returns
        a tuple (output_tensor, extra_feed), where output_tensor is the last network layer output and extra_feed is None for feed-forward
        neural nets or a dictionary describing how to feed state into the network for recurrent neural nets.
        See baselines.common/policies.py/lstm for more details on using recurrent nets in policies.

    env: RL environment. Should implement an interface similar to VecEnv (baselines.common/vec_env) or be wrapped with DummyVecEnv (baselines.common/vec_env/dummy_vec_env.py)

    seed: seed to make the random number sequence in the algorithm reproducible. By default is None, which means seeding from the system noise generator (not reproducible)

    nsteps: int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
        nenv is the number of environment copies simulated in parallel)

    total_timesteps: int, total number of timesteps to train on (default: 80M)

    vf_coef: float, coefficient in front of the value function loss in the total loss function (default: 0.5)

    ent_coef: float, coefficient in front of the policy entropy in the total loss function (default: 0.01)

    max_grad_norm: float, gradient is clipped to have global L2 norm no more than this value (default: 0.5)

    lr: float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4)

    lrschedule: schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes the fraction of the training progress as input and
        returns the fraction of the learning rate (specified as lr) as output

    epsilon: float, RMSProp epsilon (stabilizes square root computation in the denominator of the RMSProp update) (default: 1e-5)

    alpha: float, RMSProp decay parameter (default: 0.99)

    gamma: float, reward discounting parameter (default: 0.99)

    log_interval: int, specifies how frequently the logs are printed out (default: 100)

    load_path: str, path to a saved model to load before training (default: None)

    **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and the arguments to a particular type of network.
        For instance, the 'mlp' network architecture has arguments num_hidden and num_layers.

    '''

    set_global_seeds(seed)

    # Get the number of environments and build the policy
    nenvs = env.num_envs
    policy = build_policy(env, network, **network_kwargs)

    # Instantiate the model object (this creates step_model and train_model)
    model = Model(policy=policy, env=env, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
        max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule)
    if load_path is not None:
        model.load(load_path)

    # Instantiate the runner object that collects rollouts of nsteps per environment
    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    epinfobuf = deque(maxlen=100)

    # Calculate the batch size
    nbatch = nenvs*nsteps

    # Start total timer
    tstart = time.time()

    for update in range(1, total_timesteps//nbatch+1):
        # Get a minibatch of experiences from the runner
        obs, states, rewards, masks, actions, values, epinfos = runner.run()
        epinfobuf.extend(epinfos)

        policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
        nseconds = time.time()-tstart

        # Calculate the frames per second
        fps = int((update*nbatch)/nseconds)
        if update % log_interval == 0 or update == 1:
            # explained_variance checks whether the value function is a good predictor
            # of the returns (ev close to 1) or worse than predicting nothing (ev <= 0)
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update*nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.record_tabular("eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.dump_tabular()
    return model
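

# Minimal usage sketch (an illustration added here, not part of the original
# baselines API): trains an MLP policy on CartPole-v1 for a short run. Assumes
# gym is installed; the env name, number of envs, and timestep budget are
# arbitrary example choices.
if __name__ == '__main__':
    import gym
    from baselines import bench
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    def make_env():
        # Monitor records episode rewards/lengths so eprewmean/eplenmean are logged
        return bench.Monitor(gym.make('CartPole-v1'), None, allow_early_resets=True)

    venv = DummyVecEnv([make_env for _ in range(4)])
    trained_model = learn(network='mlp', env=venv, seed=0,
                          total_timesteps=int(1e5), log_interval=10)
    venv.close()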