import tensorflow as tf
import functools

from baselines.common.tf_util import get_session, save_variables, load_variables
from baselines.common.tf_util import initialize

try:
    from baselines.common.mpi_adam_optimizer import MpiAdamOptimizer
    from mpi4py import MPI
    from baselines.common.mpi_util import sync_from_root
except ImportError:
    MPI = None


class Model(object):
    """
    We use this object to:

    __init__:
    - Creates the step model (act_model), used to sample actions
    - Creates the train model (train_model), used for optimization

    train():
    - Runs one training step (feedforward and backpropagation of gradients)

    save()/load():
    - Saves/loads the model's variables
    """
    def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
                 nsteps, ent_coef, vf_coef, max_grad_norm, mpi_rank_weight=1, comm=None, microbatch_size=None):
        self.sess = sess = get_session()

        if MPI is not None and comm is None:
            comm = MPI.COMM_WORLD

        with tf.compat.v1.variable_scope('ppo2_model', reuse=tf.compat.v1.AUTO_REUSE):
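            # CREATE OUR TWO MODELS
            # act_model is used for sampling: one step at a time across nbatch_act envs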
            act_model = policy(nbatch_act, 1, sess)

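            # train_model is used for training; an optional microbatch_size lets
            # gradients be computed on slices smaller than nbatch_train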
            if microbatch_size is None:
                train_model = policy(nbatch_train, nsteps, sess)
            else:
                train_model = policy(microbatch_size, nsteps, sess)

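        # CREATE THE PLACEHOLDERS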
        self.A = A = train_model.pdtype.sample_placeholder([None])
        self.ADV = ADV = tf.compat.v1.placeholder(tf.float32, [None])
        self.R = R = tf.compat.v1.placeholder(tf.float32, [None])
        # Keep track of the old actor: -log pi_old(a|s) recorded at collection time
        self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.compat.v1.placeholder(tf.float32, [None])
        # Keep track of the old critic: value predictions recorded at collection time
        self.OLDVPRED = OLDVPRED = tf.compat.v1.placeholder(tf.float32, [None])
        self.LR = LR = tf.compat.v1.placeholder(tf.float32, [])
        # Clip range epsilon, used for both the policy ratio and the value function
        self.CLIPRANGE = CLIPRANGE = tf.compat.v1.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)

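        # Calculate the entropy: maximizing entropy encourages exploration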
        entropy = tf.reduce_mean(input_tensor=train_model.pd.entropy())

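        # CALCULATE THE LOSS
        # total loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # Clip the value update to reduce variability during critic training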
        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE)
        # Unclipped value loss
        vf_losses1 = tf.square(vpred - R)
        # Clipped value loss
        vf_losses2 = tf.square(vpredclipped - R)

        vf_loss = .5 * tf.reduce_mean(input_tensor=tf.maximum(vf_losses1, vf_losses2))

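        # Calculate the ratio pi(a|s) / pi_old(a|s); since we store negative log
        # probabilities, exp(OLDNEGLOGPAC - neglogpac) recovers this ratio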
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

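        # Defining the loss as -J means minimizing it maximizes the surrogate
        # objective J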
        pg_losses = -ADV * ratio

        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)

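        # Final PG loss: the pessimistic (max) of the unclipped and clipped
        # surrogates, plus diagnostics (approximate KL and clip fraction)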
        pg_loss = tf.reduce_mean(input_tensor=tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(input_tensor=tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(input_tensor=tf.cast(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE), dtype=tf.float32))

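        # Total loss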
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

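        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters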
        params = tf.compat.v1.trainable_variables('ppo2_model')
        # 2. Build our trainer (MPI-aware Adam when running distributed)
        if comm is not None and comm.Get_size() > 1:
            self.trainer = MpiAdamOptimizer(comm, learning_rate=LR, mpi_rank_weight=mpi_rank_weight, epsilon=1e-5)
        else:
            self.trainer = tf.compat.v1.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
        # 3. Calculate the gradients
        grads_and_var = self.trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients by their global norm
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))

        self.grads = grads
        self.var = var
        self._train_op = self.trainer.apply_gradients(grads_and_var)
        self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']
        self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac]

        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state

        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)

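        # Initialize variables, then sync them from rank 0 so all MPI workers
        # start from identical weights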
        initialize()
        global_variables = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, scope="")
        if MPI is not None:
            sync_from_root(sess, global_variables, comm=comm)

    def train(self, lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
        # Here we calculate the advantage A(s,a) = R - V(s),
        # where the returns R already include the discounted bootstrap value
        advs = returns - values

        # Normalize the advantages to zero mean and unit variance
        # (the 1e-8 guards against division by zero)
        advs = (advs - advs.mean()) / (advs.std() + 1e-8)

        td_map = {
            self.train_model.X : obs,
            self.A : actions,
            self.ADV : advs,
            self.R : returns,
            self.LR : lr,
            self.CLIPRANGE : cliprange,
            self.OLDNEGLOGPAC : neglogpacs,
            self.OLDVPRED : values
        }
        if states is not None:
            td_map[self.train_model.S] = states
            td_map[self.train_model.M] = masks

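        # Run the stats together with the train op, then drop the train op's
        # (None) output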
        return self.sess.run(
            self.stats_list + [self._train_op],
            td_map
        )[:-1]
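
# A minimal usage sketch (hypothetical; not part of the original module).
# Assumes a vectorized env `venv` and the build_policy helper from
# baselines.common.policies; the hyperparameter values are illustrative:
#
#     from baselines.common.policies import build_policy
#
#     policy = build_policy(venv, 'mlp')
#     model = Model(policy=policy, ob_space=venv.observation_space,
#                   ac_space=venv.action_space, nbatch_act=venv.num_envs,
#                   nbatch_train=256, nsteps=128, ent_coef=0.01,
#                   vf_coef=0.5, max_grad_norm=0.5)
#     actions, values, states, neglogpacs = model.step(obs)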