import tensorflow as tf
import functools
from baselines.common.tf_util import get_session, save_variables, load_variables
from baselines.common.tf_util import initialize
import numpy as np
from itertools import combinations
try:
from baselines.common.mpi_adam_optimizer import MpiAdamOptimizer
from mpi4py import MPI
from baselines.common.mpi_util import sync_from_root
except ImportError:
MPI = None
class Model(object):
"""
We use this object to :
__init__:
- Creates the step_model
- Creates the train_model
train():
- Make the training part (feedforward and retropropagation of gradients)
save/load():
- Save load the model
"""
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
nsteps, ent_coef, vf_coef, skill_coef, vq_coef, max_grad_norm, alpha1=20, alpha2=1, beta=0.25,
Cd=5, mpi_rank_weight=1, comm=None,
microbatch_size=None):
self.sess = sess = get_session()
if MPI is not None and comm is None:
comm = MPI.COMM_WORLD
with tf.compat.v1.variable_scope('sppo2_model', reuse=tf.compat.v1.AUTO_REUSE):
# CREATE OUR TWO MODELS
# act_model that is used for sampling
act_model = policy(nbatch_act, 1, sess)
# Train model for training
if microbatch_size is None:
train_model = policy(nbatch_train, nsteps, sess)
else:
train_model = policy(microbatch_size, nsteps, sess)
# CREATE THE PLACEHOLDERS
self.A = A = train_model.pdtype.sample_placeholder([None])
self.ADV = ADV = tf.compat.v1.placeholder(tf.float32, [None])
self.R = R = tf.compat.v1.placeholder(tf.float32, [None])
self.skill_weight = skill_weight = tf.compat.v1.placeholder(tf.float32, [])
# Keep track of old actor
self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.compat.v1.placeholder(tf.float32, [None])
# Keep track of old critic
self.OLDVPRED = OLDVPRED = tf.compat.v1.placeholder(tf.float32, [None])
self.LR = LR = tf.compat.v1.placeholder(tf.float32, [])
# Cliprange
self.CLIPRANGE = CLIPRANGE = tf.compat.v1.placeholder(tf.float32, [])
self.train_skill = train_skill = tf.compat.v1.placeholder(tf.bool, [])
neglogpac = train_model.pd.neglogp(A)
# Calculate the entropy
# Entropy is used to improve exploration by limiting premature convergence to a suboptimal policy.
entropy = tf.reduce_mean(input_tensor=train_model.pd.entropy())
# CALCULATE THE LOSS
# Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss
# Clip the value to reduce variability during Critic training
# Get the predicted value
vpred = train_model.vf
vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE)
# Unclipped value
vf_losses1 = tf.square(vpred - R)
# Clipped value
vf_losses2 = tf.square(vpredclipped - R)
vf_loss = .5 * tf.reduce_mean(input_tensor=tf.maximum(vf_losses1, vf_losses2))
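# The resulting value loss is the PPO-style clipped objective:
#   L_VF = 0.5 * E[ max( (V(s) - R)^2, (V_clip(s) - R)^2 ) ]
# where V_clip(s) = V_old(s) + clip(V(s) - V_old(s), -CLIPRANGE, +CLIPRANGE),
# so the critic cannot move more than CLIPRANGE away from its old prediction
# in a single update.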
# Calculate ratio (pi current policy / pi old policy)
ratio = tf.exp(OLDNEGLOGPAC - neglogpac)
# Defining Loss = - J is equivalent to max J
pg_losses = -ADV * ratio
pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)
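# Together these implement the PPO clipped surrogate objective
#   L_CLIP = E[ min( r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t ) ]
# written here as the maximum of the two negated terms, since we minimize the loss.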
# calculating the skill loss
# To avoid running out of GPU memory, choose a chunk size appropriate for your GPUs
chunk_size = 64
def pairwise_sqd_distance(x):
if x.shape[-1] <= chunk_size:
tiled = tf.tile(tf.expand_dims(x, axis=1), tf.stack([1, x.shape[0], 1]))
tiled_trans = tf.transpose(tiled, perm=[1, 0, 2])
diffs = tiled - tiled_trans
sqd_dist_mat = tf.reduce_sum(tf.square(diffs), axis=2)
else:
# Accumulate the squared distances chunk by chunk over the feature dimension
# (any features beyond the last full chunk are ignored).
sqd_dist_mat = None
for i in range(x.shape[-1] // chunk_size):
tiled = tf.tile(tf.expand_dims(x[:, i * chunk_size:i * chunk_size + chunk_size], axis=1),
tf.stack([1, x.shape[0], 1]))
tiled_trans = tf.transpose(tiled, perm=[1, 0, 2])
diffs = tiled - tiled_trans
if sqd_dist_mat is None:
sqd_dist_mat = tf.reduce_sum(tf.square(diffs), axis=2)
else:
sqd_dist_mat = tf.add(sqd_dist_mat, tf.reduce_sum(tf.square(diffs), axis=2))
return sqd_dist_mat
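# pairwise_sqd_distance(x) returns the [batch, batch] matrix of squared
# Euclidean distances D[i, j] = ||x_i - x_j||^2, summing over feature chunks
# of size chunk_size so the [batch, batch, features] intermediate never has
# to be materialized in one piece.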
def make_q(z, alpha):
sqd_dist_mat = pairwise_sqd_distance(z)
q = tf.pow((1 + sqd_dist_mat / alpha), -(alpha + 1) / 2)
q = tf.linalg.set_diag(q, np.zeros(shape=[z.shape[0]]))
q = q / tf.reduce_sum(q, axis=0, keepdims=True)
q = tf.clip_by_value(q, 1e-10, 1.0)
return q
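# make_q turns a batch of latents into a pairwise similarity distribution using
# a Student's t kernel, q_ij proportional to (1 + ||z_i - z_j||^2 / alpha)^(-(alpha + 1) / 2),
# with the diagonal zeroed and each column normalized to sum to one (the same
# kernel used in t-SNE / deep embedded clustering). Larger alpha makes the
# kernel closer to a Gaussian; smaller alpha gives heavier tails.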
p = make_q(train_model.pure_latent, alpha=alpha1)
q = make_q(train_model.skill_latent, alpha=alpha2)
skill_losses = tf.reduce_sum(-(tf.multiply(p, tf.math.log(q)))) / (p.shape[0] * p.shape[1])
skill_loss_ = tf.reduce_mean(skill_losses)
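# The skill loss is the cross-entropy H(p, q) between the two similarity
# distributions: p is built from the policy's pure latent (alpha1) and q from
# the skill latent (alpha2, heavier-tailed), which roughly encourages the skill
# embedding to reproduce the neighbourhood structure of the pure latent.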
# calculating the vq loss
if beta != 0:
commitment_loss = beta * tf.reduce_mean(
input_tensor=(tf.stop_gradient(train_model.pure_vq_latent) - train_model.skill_latent) ** 2, axis=1
)
else:
commitment_loss = 0
codebook_loss = tf.reduce_mean(
input_tensor=(train_model.pure_vq_latent - tf.stop_gradient(train_model.skill_latent)) ** 2, axis=1)
vq_losses = commitment_loss + codebook_loss
vq_loss_ = tf.reduce_mean(vq_losses)
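# These follow the VQ-VAE training objective (assuming skill_latent is the
# continuous embedding and pure_vq_latent its quantized counterpart): the
# codebook loss ||pure_vq_latent - sg(skill_latent)||^2 moves the codebook
# vectors towards the embeddings, while the commitment loss
# beta * ||sg(pure_vq_latent) - skill_latent||^2 keeps the embeddings committed
# to their codes (sg = stop_gradient). Setting beta to 0 disables the
# commitment term.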
# Final PG loss
pg_loss = tf.reduce_mean(input_tensor=tf.maximum(pg_losses, pg_losses2))
approxkl = .5 * tf.reduce_mean(input_tensor=tf.square(neglogpac - OLDNEGLOGPAC))
clipfrac = tf.reduce_mean(input_tensor=tf.cast(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE), dtype=tf.float32))
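# Training diagnostics: approxkl is a simple estimate of the KL divergence
# between the old and current policies, 0.5 * E[(log pi_old - log pi)^2], and
# clipfrac is the fraction of samples whose probability ratio was actually
# clipped by CLIPRANGE.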
# Total loss
loss = tf.cond(train_skill, lambda: pg_loss - entropy * ent_coef + vf_loss * vf_coef + skill_weight * (
skill_loss_ * skill_coef + vq_loss_ * vq_coef),
lambda: pg_loss - entropy * ent_coef + vf_loss * vf_coef)
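# tf.cond makes the skill/VQ terms switchable at run time: when train_skill is
# False the objective reduces to the standard PPO loss; when it is True the
# skill and VQ losses are added, scaled by their coefficients and by the
# skill_weight fed in at each update.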
# UPDATE THE PARAMETERS USING LOSS
# 1. Get the model parameters
params = tf.compat.v1.trainable_variables('sppo2_model')
# 2. Build our trainer
if comm is not None and comm.Get_size() > 1:
self.trainer = MpiAdamOptimizer(comm, learning_rate=LR, mpi_rank_weight=mpi_rank_weight, epsilon=1e-5)
else:
self.trainer = tf.compat.v1.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
# 3. Calculate the gradients
grads_and_var = self.trainer.compute_gradients(loss, params)
grads, var = zip(*grads_and_var)
if max_grad_norm is not None:
# Clip the gradients by their global norm
grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
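# clip_by_global_norm rescales all gradients jointly so that their combined
# L2 norm never exceeds max_grad_norm; individual gradients are not clipped
# separately.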
grads_and_var = list(zip(grads, var))
# zip pairs each gradient with its associated parameter
# For instance zip(ABCD, xyza) => Ax, By, Cz, Da
self.grads = grads
self.var = var
self._train_op = self.trainer.apply_gradients(grads_and_var)
self.loss_names = ['loss', 'policy_loss', 'value_loss', 'skill_loss', 'vq_loss',
'policy_entropy',
'approxkl',
'clipfrac']
self.stats_list = [loss, pg_loss, vf_loss, skill_loss_, vq_loss_, entropy, approxkl,
clipfrac]
self.train_model = train_model
self.act_model = act_model
self.step = act_model.step
self.skill_step = act_model.skill_step
self.value = act_model.value
self.initial_state = act_model.initial_state
self.save = functools.partial(save_variables, sess=sess)
self.load = functools.partial(load_variables, sess=sess)
initialize()
global_variables = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, scope="")
if MPI is not None:
sync_from_root(sess, global_variables, comm=comm) # pylint: disable=E1101
def train(self, lr, cliprange, train_skill, skill_weight, obs, returns,
masks, actions,
values,
neglogpacs,
states=None):
# Here we calculate the advantage A(s,a) = R + gamma * V(s') - V(s)
# Returns = R + gamma * V(s')
advs = returns - values
# Normalize the advantages
advs = (advs - advs.mean()) / (advs.std() + 1e-8)
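# Normalizing per batch passed to train() (zero mean, unit variance, with a
# small epsilon for numerical stability) keeps the scale of the policy-gradient
# term roughly constant across updates.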
td_map = {
self.train_model.X: obs,
self.A: actions,
self.ADV: advs,
self.R: returns,
self.LR: lr,
self.train_skill: train_skill,
self.skill_weight: skill_weight,
self.CLIPRANGE: cliprange,
self.OLDNEGLOGPAC: neglogpacs,
self.OLDVPRED: values
}
if states is not None:
td_map[self.train_model.S] = states
td_map[self.train_model.M] = masks
return self.sess.run(
self.stats_list + [self._train_op],
td_map
)[:-1]
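# Illustrative usage sketch: a learner loop would construct the Model once and
# call train() on flattened rollout minibatches, e.g.
#
#   model = Model(policy=policy_fn, ob_space=ob_space, ac_space=ac_space,
#                 nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps,
#                 ent_coef=0.01, vf_coef=0.5, skill_coef=1.0, vq_coef=1.0,
#                 max_grad_norm=0.5)
#   stats = model.train(lr, cliprange, train_skill=True, skill_weight=0.1,
#                       obs=obs[mbinds], returns=returns[mbinds],
#                       masks=masks[mbinds], actions=actions[mbinds],
#                       values=values[mbinds], neglogpacs=neglogpacs[mbinds])
#
# where policy_fn, nenvs, the coefficient values and the minibatch indices
# mbinds are placeholders chosen by the caller, as in the baselines ppo2 learner.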