import tensorflow as tf
import functools

from baselines.common.tf_util import get_session, save_variables, load_variables
from baselines.common.tf_util import initialize

try:
    from baselines.common.mpi_adam_optimizer import MpiAdamOptimizer
    from mpi4py import MPI
    from baselines.common.mpi_util import sync_from_root
except ImportError:
    MPI = None


class Model(object):
    """
    We use this object to:

    __init__:
    - Creates the step model (act_model), used to sample actions
    - Creates the train model (train_model), used for optimization

    train():
    - Runs one training step (feedforward and backpropagation of gradients)

    save()/load():
    - Saves/loads the model's variables
    """
    def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
                 nsteps, ent_coef, vf_coef, max_grad_norm, mpi_rank_weight=1, comm=None, microbatch_size=None):
        self.sess = sess = get_session()

        if MPI is not None and comm is None:
            comm = MPI.COMM_WORLD

        with tf.compat.v1.variable_scope('ppo2_model', reuse=tf.compat.v1.AUTO_REUSE):
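            # CREATE OUR TWO MODELS
            # act_model is used for sampling: one step at a time across nbatch_act envs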
            act_model = policy(nbatch_act, 1, sess)

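            # train_model is used for training; an optional microbatch_size lets
            # gradients be computed on slices smaller than nbatch_train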
            if microbatch_size is None:
                train_model = policy(nbatch_train, nsteps, sess)
            else:
                train_model = policy(microbatch_size, nsteps, sess)

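        # CREATE THE PLACEHOLDERS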
        self.A = A = train_model.pdtype.sample_placeholder([None])
        self.ADV = ADV = tf.compat.v1.placeholder(tf.float32, [None])
        self.R = R = tf.compat.v1.placeholder(tf.float32, [None])
        # Keep track of the old actor: -log pi_old(a|s) recorded at collection time
        self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.compat.v1.placeholder(tf.float32, [None])
        # Keep track of the old critic: value predictions recorded at collection time
        self.OLDVPRED = OLDVPRED = tf.compat.v1.placeholder(tf.float32, [None])
        self.LR = LR = tf.compat.v1.placeholder(tf.float32, [])
        # Clip range epsilon, used for both the policy ratio and the value function
        self.CLIPRANGE = CLIPRANGE = tf.compat.v1.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)

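        # Calculate the entropy: maximizing entropy encourages exploration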
        entropy = tf.reduce_mean(input_tensor=train_model.pd.entropy())

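        # CALCULATE THE LOSS
        # total loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # Clip the value update to reduce variability during critic training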
        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE)
        # Unclipped value loss
        vf_losses1 = tf.square(vpred - R)
        # Clipped value loss
        vf_losses2 = tf.square(vpredclipped - R)

        vf_loss = .5 * tf.reduce_mean(input_tensor=tf.maximum(vf_losses1, vf_losses2))

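        # Calculate the ratio pi(a|s) / pi_old(a|s); since we store negative log
        # probabilities, exp(OLDNEGLOGPAC - neglogpac) recovers this ratio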
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

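        # Defining the loss as -J means minimizing it maximizes the surrogate
        # objective J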
        pg_losses = -ADV * ratio

        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)

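        # Final PG loss: the pessimistic (max) of the unclipped and clipped
        # surrogates, plus diagnostics (approximate KL and clip fraction)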
        pg_loss = tf.reduce_mean(input_tensor=tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(input_tensor=tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(input_tensor=tf.cast(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE), dtype=tf.float32))

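        # Total loss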
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

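        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters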
        params = tf.compat.v1.trainable_variables('ppo2_model')
        # 2. Build our trainer (MPI-aware Adam when running distributed)
        if comm is not None and comm.Get_size() > 1:
            self.trainer = MpiAdamOptimizer(comm, learning_rate=LR, mpi_rank_weight=mpi_rank_weight, epsilon=1e-5)
        else:
            self.trainer = tf.compat.v1.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
        # 3. Calculate the gradients
        grads_and_var = self.trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients by their global norm
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))

        self.grads = grads
        self.var = var
        self._train_op = self.trainer.apply_gradients(grads_and_var)
        self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']
        self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac]

        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state

        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)

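        # Initialize variables, then sync them from rank 0 so all MPI workers
        # start from identical weights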
        initialize()
        global_variables = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, scope="")
        if MPI is not None:
            sync_from_root(sess, global_variables, comm=comm)

    def train(self, lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
        # Here we calculate the advantage A(s,a) = R - V(s),
        # where the returns R already include the discounted bootstrap value
        advs = returns - values

        # Normalize the advantages to zero mean and unit variance
        # (the 1e-8 guards against division by zero)
        advs = (advs - advs.mean()) / (advs.std() + 1e-8)

        td_map = {
            self.train_model.X : obs,
            self.A : actions,
            self.ADV : advs,
            self.R : returns,
            self.LR : lr,
            self.CLIPRANGE : cliprange,
            self.OLDNEGLOGPAC : neglogpacs,
            self.OLDVPRED : values
        }
        if states is not None:
            td_map[self.train_model.S] = states
            td_map[self.train_model.M] = masks

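        # Run the stats together with the train op, then drop the train op's
        # (None) output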
        return self.sess.run(
            self.stats_list + [self._train_op],
            td_map
        )[:-1]
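
# A minimal usage sketch (hypothetical; not part of the original module).
# Assumes a vectorized env `venv` and the build_policy helper from
# baselines.common.policies; the hyperparameter values are illustrative:
#
#     from baselines.common.policies import build_policy
#
#     policy = build_policy(venv, 'mlp')
#     model = Model(policy=policy, ob_space=venv.observation_space,
#                   ac_space=venv.action_space, nbatch_act=venv.num_envs,
#                   nbatch_train=256, nsteps=128, ent_coef=0.01,
#                   vf_coef=0.5, max_grad_norm=0.5)
#     actions, values, states, neglogpacs = model.step(obs)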