Upload 254 files

5960497 verified 4 months ago

21.3 kB

	"""Deep Q learning graph

	The functions in this file are used to create the following functions:

	======= act ========

	Function to choose an action given an observation

	Parameters
	----------
	observation: object
	Observation that can be fed into the output of make_obs_ph
	stochastic: bool
	if set to False all the actions are always deterministic (default True)
	update_eps_ph: float
	update epsilon to a new value, if negative no update happens
	(default: no update)

	Returns
	-------
	Tensor of dtype tf.int64 and shape (BATCH_SIZE,) with an action to be performed for
	every element of the batch.


	======= act (in case of parameter noise) ========

	Function to choose an action given an observation

	Parameters
	----------
	observation: object
	Observation that can be fed into the output of make_obs_ph
	stochastic: bool
	if set to False all the actions are always deterministic (default True)
	update_eps_ph: float
	update epsilon to a new value, if negative no update happens
	(default: no update)
	reset_ph: bool
	reset the perturbed policy by sampling a new perturbation
	update_param_noise_threshold_ph: float
	the desired threshold for the difference between non-perturbed and perturbed policy
	update_param_noise_scale_ph: bool
	whether or not to update the scale of the noise for the next time it is re-perturbed

	Returns
	-------
	Tensor of dtype tf.int64 and shape (BATCH_SIZE,) with an action to be performed for
	every element of the batch.


	======= train =======

	Function that takes a transition (s,a,r,s') and optimizes Bellman equation's error:

	td_error = Q(s,a) - (r + gamma * max_a' Q(s', a'))
	loss = huber_loss[td_error]

	Parameters
	----------
	obs_t: object
	a batch of observations
	action: np.array
	actions that were selected upon seeing obs_t.
	dtype must be int32 and shape must be (batch_size,)
	reward: np.array
	immediate reward attained after executing those actions
	dtype must be float32 and shape must be (batch_size,)
	obs_tp1: object
	observations that followed obs_t
	done: np.array
	1 if obs_t was the last observation in the episode and 0 otherwise
	obs_tp1 gets ignored, but must be of the valid shape.
	dtype must be float32 and shape must be (batch_size,)
	weight: np.array
	importance weights for every element of the batch (gradient is multiplied
	by the importance weight) dtype must be float32 and shape must be (batch_size,)

	Returns
	-------
	td_error: np.array
	a list of differences between Q(s,a) and the target in Bellman's equation.
	dtype is float32 and shape is (batch_size,)

	======= update_target ========

	copy the parameters from optimized Q function to the target Q function.
	In Q learning we actually optimize the following error:

	Q(s,a) - (r + gamma * max_a' Q'(s', a'))

	Where Q' is lagging behind Q to stabilize the learning. For example, for Atari
	Q' is set to Q once every 10000 training steps.

	"""
	import tensorflow as tf
	import baselines.common.tf_util as U


	def scope_vars(scope, trainable_only=False):
	    """Collect the variables registered under a variable scope.

	    Parameters
	    ----------
	    scope: str or VariableScope
	        scope in which the variables reside. A string is used as-is; for
	        anything else the object's `name` attribute is used.
	    trainable_only: bool
	        if True, read the TRAINABLE_VARIABLES collection; otherwise read
	        GLOBAL_VARIABLES.

	    Returns
	    -------
	    vars: [tf.Variable]
	        list of variables in `scope`.
	    """
	    graph_keys = tf.compat.v1.GraphKeys
	    if trainable_only:
	        collection = graph_keys.TRAINABLE_VARIABLES
	    else:
	        collection = graph_keys.GLOBAL_VARIABLES

	    if isinstance(scope, str):
	        scope_filter = scope
	    else:
	        scope_filter = scope.name

	    return tf.compat.v1.get_collection(collection, scope=scope_filter)


	def scope_name():
	    """Return the name of the current variable scope as a string, e.g. deepq/q_func."""
	    current_scope = tf.compat.v1.get_variable_scope()
	    return current_scope.name


	def absolute_scope_name(relative_scope_name):
	    """Prefix `relative_scope_name` with the current scope name and a "/" separator."""
	    return "/".join((scope_name(), relative_scope_name))


	def default_param_noise_filter(var):
	    """Default rule for choosing which variables receive parameter noise.

	    Parameters
	    ----------
	    var: tf.Variable
	        candidate variable.

	    Returns
	    -------
	    bool
	        True only when `var` is among the graph's trainable variables and
	        its name contains "fully_connected"; False otherwise.
	    """
	    # Non-trainable variables are never perturbed.
	    is_trainable = var in tf.compat.v1.trainable_variables()

	    # Only fully-connected layers are perturbed. Everything else is likely a conv
	    # layer (pure feature extraction) or a layer-norm layer (normalization), which
	    # we do not wish to perturb. If you change your network, you will likely want
	    # to re-consider which layers to perturb and which to keep untouched.
	    return is_trainable and "fully_connected" in var.name


	def build_act(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None):
	    """Build the epsilon-greedy `act` function.

	    Parameters
	    ----------
	    make_obs_ph: str -> tf.placeholder or TfInput
	        a function that takes a name and creates a placeholder of input with that name
	    q_func: (tf.Variable, int, str, bool) -> tf.Variable
	        the model that takes the following inputs:
	            observation_in: object
	                the output of observation placeholder
	            num_actions: int
	                number of actions
	            scope: str
	            reuse: bool
	                should be passed to outer variable scope
	        and returns a tensor of shape (batch_size, num_actions) with values of every action.
	    num_actions: int
	        number of actions.
	    scope: str or VariableScope
	        optional scope for variable_scope.
	    reuse: bool or None
	        whether or not the variables should be reused. To be able to reuse the scope must be given.

	    Returns
	    -------
	    act: (ob, stochastic=True, update_eps=-1) -> actions
	        function to select an action given an observation.
	        See the top of the file for details.
	    """
	    tf1 = tf.compat.v1
	    with tf1.variable_scope(scope, reuse=reuse):
	        obs_input = make_obs_ph("observation")
	        use_stochastic = tf1.placeholder(tf.bool, (), name="stochastic")
	        new_eps = tf1.placeholder(tf.float32, (), name="update_eps")

	        # Exploration rate, stored in the graph so it persists between calls.
	        eps = tf1.get_variable("eps", (), initializer=tf1.constant_initializer(0))

	        # Greedy choice: argmax over the action axis of the Q-values.
	        q_values = q_func(obs_input.get(), num_actions, scope="q_func")
	        greedy_actions = tf.argmax(input=q_values, axis=1)

	        # Per batch row: draw a uniform random action, and independently decide
	        # (with probability eps) whether that random action replaces the greedy one.
	        n_rows = tf.shape(input=obs_input.get())[0]
	        uniform_actions = tf.random.uniform(
	            tf.stack([n_rows]), minval=0, maxval=num_actions, dtype=tf.int64)
	        explore_mask = tf.random.uniform(
	            tf.stack([n_rows]), minval=0, maxval=1, dtype=tf.float32) < eps
	        eps_greedy_actions = tf1.where(explore_mask, uniform_actions, greedy_actions)

	        chosen_actions = tf.cond(
	            pred=use_stochastic,
	            true_fn=lambda: eps_greedy_actions,
	            false_fn=lambda: greedy_actions)

	        # A negative fed value keeps the current eps; anything >= 0 replaces it.
	        next_eps = tf.cond(
	            pred=new_eps >= 0,
	            true_fn=lambda: new_eps,
	            false_fn=lambda: eps)
	        eps_update = eps.assign(next_eps)

	        _act = U.function(
	            inputs=[obs_input, use_stochastic, new_eps],
	            outputs=chosen_actions,
	            givens={new_eps: -1.0, use_stochastic: True},
	            updates=[eps_update])

	        def act(ob, stochastic=True, update_eps=-1):
	            return _act(ob, stochastic, update_eps)

	        return act


	def build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None, param_noise_filter_func=None):
	    """Creates the act function with support for parameter space noise exploration (https://arxiv.org/abs/1706.01905).

	    Three copies of the Q network are built under `scope`: "q_func" (unperturbed),
	    "perturbed_q_func" (its argmax drives the returned actions) and "adaptive_q_func"
	    (used only to measure the effect of the current noise scale via a KL term).

	    Parameters
	    ----------
	    make_obs_ph: str -> tf.placeholder or TfInput
	        a function that takes a name and creates a placeholder of input with that name
	    q_func: (tf.Variable, int, str, bool) -> tf.Variable
	        the model that takes the following inputs:
	            observation_in: object
	                the output of observation placeholder
	            num_actions: int
	                number of actions
	            scope: str
	            reuse: bool
	                should be passed to outer variable scope
	        and returns a tensor of shape (batch_size, num_actions) with values of every action.
	    num_actions: int
	        number of actions.
	    scope: str or VariableScope
	        optional scope for variable_scope.
	    reuse: bool or None
	        whether or not the variables should be reused. To be able to reuse the scope must be given.
	    param_noise_filter_func: tf.Variable -> bool
	        function that decides whether or not a variable should be perturbed.
	        If set to None, default_param_noise_filter is used.

	    Returns
	    -------
	    act: (ob, reset=False, update_param_noise_threshold=False,
	          update_param_noise_scale=False, stochastic=True, update_eps=-1) -> actions
	        function to select an action given an observation.
	        `update_param_noise_threshold` replaces the stored threshold when it is a
	        number >= 0; a negative number or the default `False` leaves it unchanged.
	        See the top of the file for details on the other arguments.
	    """
	    if param_noise_filter_func is None:
	        param_noise_filter_func = default_param_noise_filter

	    with tf.compat.v1.variable_scope(scope, reuse=reuse):
	        observations_ph = make_obs_ph("observation")
	        stochastic_ph = tf.compat.v1.placeholder(tf.bool, (), name="stochastic")
	        update_eps_ph = tf.compat.v1.placeholder(tf.float32, (), name="update_eps")
	        update_param_noise_threshold_ph = tf.compat.v1.placeholder(tf.float32, (), name="update_param_noise_threshold")
	        update_param_noise_scale_ph = tf.compat.v1.placeholder(tf.bool, (), name="update_param_noise_scale")
	        reset_ph = tf.compat.v1.placeholder(tf.bool, (), name="reset")

	        eps = tf.compat.v1.get_variable("eps", (), initializer=tf.compat.v1.constant_initializer(0))
	        param_noise_scale = tf.compat.v1.get_variable("param_noise_scale", (), initializer=tf.compat.v1.constant_initializer(0.01), trainable=False)
	        param_noise_threshold = tf.compat.v1.get_variable("param_noise_threshold", (), initializer=tf.compat.v1.constant_initializer(0.05), trainable=False)

	        # Unmodified Q.
	        q_values = q_func(observations_ph.get(), num_actions, scope="q_func")

	        # Perturbable Q used for the actual rollout.
	        q_values_perturbed = q_func(observations_ph.get(), num_actions, scope="perturbed_q_func")

	        # We have to wrap this code into a function due to the way tf.cond() works. See
	        # https://stackoverflow.com/questions/37063952/confused-by-the-behavior-of-tf-cond for
	        # a more detailed discussion.
	        def perturb_vars(original_scope, perturbed_scope):
	            """Return one op that overwrites every variable in `perturbed_scope` with
	            its counterpart from `original_scope`, adding Gaussian noise (stddev
	            `param_noise_scale`) to those selected by `param_noise_filter_func`."""
	            all_vars = scope_vars(absolute_scope_name(original_scope))
	            all_perturbed_vars = scope_vars(absolute_scope_name(perturbed_scope))
	            # Variables are paired positionally, so both scopes must hold the same number.
	            assert len(all_vars) == len(all_perturbed_vars)
	            perturb_ops = []
	            for var, perturbed_var in zip(all_vars, all_perturbed_vars):
	                if param_noise_filter_func(perturbed_var):
	                    # Perturb this variable.
	                    op = tf.compat.v1.assign(perturbed_var, var + tf.random.normal(shape=tf.shape(input=var), mean=0., stddev=param_noise_scale))
	                else:
	                    # Do not perturb, just assign.
	                    op = tf.compat.v1.assign(perturbed_var, var)
	                perturb_ops.append(op)
	            assert len(perturb_ops) == len(all_vars)
	            return tf.group(*perturb_ops)

	        # Set up functionality to re-compute `param_noise_scale`. This perturbs yet another copy
	        # of the network and measures the effect of that perturbation in action space. If the perturbation
	        # is too big, reduce scale of perturbation, otherwise increase.
	        q_values_adaptive = q_func(observations_ph.get(), num_actions, scope="adaptive_q_func")
	        perturb_for_adaption = perturb_vars(original_scope="q_func", perturbed_scope="adaptive_q_func")
	        # Per-row KL divergence between softmax(q_values) and softmax(q_values_adaptive),
	        # then averaged over the batch.
	        kl = tf.reduce_sum(input_tensor=tf.nn.softmax(q_values) * (tf.math.log(tf.nn.softmax(q_values)) - tf.math.log(tf.nn.softmax(q_values_adaptive))), axis=-1)
	        mean_kl = tf.reduce_mean(input_tensor=kl)

	        def update_scale():
	            """Grow the noise scale by 1% when mean_kl is below the threshold, shrink it by 1% otherwise."""
	            with tf.control_dependencies([perturb_for_adaption]):
	                update_scale_expr = tf.cond(pred=mean_kl < param_noise_threshold,
	                    true_fn=lambda: param_noise_scale.assign(param_noise_scale * 1.01),
	                    false_fn=lambda: param_noise_scale.assign(param_noise_scale / 1.01),
	                )
	            return update_scale_expr

	        # Functionality to update the threshold for parameter space noise.
	        # Any fed value >= 0 replaces the stored threshold; a negative value keeps it.
	        update_param_noise_threshold_expr = param_noise_threshold.assign(tf.cond(pred=update_param_noise_threshold_ph >= 0,
	            true_fn=lambda: update_param_noise_threshold_ph, false_fn=lambda: param_noise_threshold))

	        # Put everything together.
	        deterministic_actions = tf.argmax(input=q_values_perturbed, axis=1)
	        batch_size = tf.shape(input=observations_ph.get())[0]
	        random_actions = tf.random.uniform(tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int64)
	        chose_random = tf.random.uniform(tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
	        stochastic_actions = tf.compat.v1.where(chose_random, random_actions, deterministic_actions)

	        output_actions = tf.cond(pred=stochastic_ph, true_fn=lambda: stochastic_actions, false_fn=lambda: deterministic_actions)
	        update_eps_expr = eps.assign(tf.cond(pred=update_eps_ph >= 0, true_fn=lambda: update_eps_ph, false_fn=lambda: eps))
	        updates = [
	            update_eps_expr,
	            # Re-sample the rollout network's perturbation only when `reset` is fed True.
	            tf.cond(pred=reset_ph, true_fn=lambda: perturb_vars(original_scope="q_func", perturbed_scope="perturbed_q_func"), false_fn=lambda: tf.group(*[])),
	            # The false branch only has to produce a placeholder result; a fresh
	            # non-trainable zero variable is used as that dummy value.
	            tf.cond(pred=update_param_noise_scale_ph, true_fn=lambda: update_scale(), false_fn=lambda: tf.Variable(0., trainable=False)),
	            update_param_noise_threshold_expr,
	        ]
	        # The threshold default must be negative: the placeholder is float32 and the
	        # update above fires for any value >= 0, so a `False` default (presumably fed
	        # as 0.0) would silently reset the stored threshold to zero.
	        _act = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph, reset_ph, update_param_noise_threshold_ph, update_param_noise_scale_ph],
	                          outputs=output_actions,
	                          givens={update_eps_ph: -1.0, stochastic_ph: True, reset_ph: False, update_param_noise_threshold_ph: -1.0, update_param_noise_scale_ph: False},
	                          updates=updates)

	        def act(ob, reset=False, update_param_noise_threshold=False, update_param_noise_scale=False, stochastic=True, update_eps=-1):
	            # Keep the historical `False` default in the signature, but translate it to
	            # the negative "no update" sentinel so that calling act(ob) without a
	            # threshold leaves param_noise_threshold untouched.
	            if update_param_noise_threshold is False:
	                update_param_noise_threshold = -1.0
	            return _act(ob, stochastic, update_eps, reset, update_param_noise_threshold, update_param_noise_scale)

	        return act


	def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0,
	                double_q=True, scope="deepq", reuse=None, param_noise=False, param_noise_filter_func=None):
	    """Build the act / train / update_target functions for deep Q learning.

	    Parameters
	    ----------
	    make_obs_ph: str -> tf.placeholder or TfInput
	        a function that takes a name and creates a placeholder of input with that name
	    q_func: (tf.Variable, int, str, bool) -> tf.Variable
	        the model that takes the following inputs:
	            observation_in: object
	                the output of observation placeholder
	            num_actions: int
	                number of actions
	            scope: str
	            reuse: bool
	                should be passed to outer variable scope
	        and returns a tensor of shape (batch_size, num_actions) with values of every action.
	    num_actions: int
	        number of actions
	    optimizer: tf.train.Optimizer
	        optimizer to use for the Q-learning objective.
	    grad_norm_clipping: float or None
	        clip gradient norms to this value. If None no clipping is performed.
	    gamma: float
	        discount rate.
	    double_q: bool
	        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
	        In general it is a good idea to keep it enabled.
	    scope: str or VariableScope
	        optional scope for variable_scope.
	    reuse: bool or None
	        whether or not the variables should be reused. To be able to reuse the scope must be given.
	    param_noise: bool
	        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
	    param_noise_filter_func: tf.Variable -> bool
	        function that decides whether or not a variable should be perturbed. Only applicable
	        if param_noise is True. If set to None, default_param_noise_filter is used by default.

	    Returns
	    -------
	    act: (tf.Variable, bool, float) -> tf.Variable
	        function to select an action given an observation.
	        See the top of the file for details.
	    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
	        optimize the error in Bellman's equation.
	        See the top of the file for details.
	    update_target: () -> ()
	        copy the parameters from optimized Q function to the target Q function.
	        See the top of the file for details.
	    debug: {str: function}
	        a bunch of functions to print debug data like q_values.
	    """
	    # The acting graph is built first; it creates the "q_func" variables that the
	    # training graph below reuses.
	    if not param_noise:
	        act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse)
	    else:
	        act_f = build_act_with_param_noise(
	            make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse,
	            param_noise_filter_func=param_noise_filter_func)

	    tf1 = tf.compat.v1
	    with tf1.variable_scope(scope, reuse=reuse):
	        # Inputs describing a batch of transitions.
	        obs_t_input = make_obs_ph("obs_t")
	        act_t_ph = tf1.placeholder(tf.int32, [None], name="action")
	        rew_t_ph = tf1.placeholder(tf.float32, [None], name="reward")
	        obs_tp1_input = make_obs_ph("obs_tp1")
	        done_mask_ph = tf1.placeholder(tf.float32, [None], name="done")
	        importance_weights_ph = tf1.placeholder(tf.float32, [None], name="weight")

	        # Online network on obs_t, sharing parameters with the acting graph.
	        q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True)
	        q_func_vars = scope_vars(absolute_scope_name("q_func"))

	        # Target network on obs_tp1, with its own variables.
	        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
	        target_q_func_vars = scope_vars(absolute_scope_name("target_q_func"))

	        # Q(s, a) for the action that was actually taken (one-hot selection).
	        taken_action_mask = tf.one_hot(act_t_ph, num_actions)
	        q_t_selected = tf.reduce_sum(input_tensor=q_t * taken_action_mask, axis=1)

	        # Estimate of the best achievable value from the next state.
	        if not double_q:
	            q_tp1_best = tf.reduce_max(input_tensor=q_tp1, axis=1)
	        else:
	            # Double Q: the online network picks the action, the target network scores it.
	            q_tp1_online = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True)
	            best_action_online = tf.argmax(input=q_tp1_online, axis=1)
	            q_tp1_best = tf.reduce_sum(
	                input_tensor=q_tp1 * tf.one_hot(best_action_online, num_actions), axis=1)
	        # Zero out the bootstrap term where the done mask is 1.
	        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

	        # Right-hand side of the Bellman equation.
	        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

	        # TD error; the target is treated as a constant for differentiation.
	        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
	        per_sample_loss = U.huber_loss(td_error)
	        weighted_error = tf.reduce_mean(input_tensor=importance_weights_ph * per_sample_loss)

	        # Optimization op, with optional per-gradient norm clipping.
	        if grad_norm_clipping is None:
	            optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)
	        else:
	            grads_and_vars = optimizer.compute_gradients(weighted_error, var_list=q_func_vars)
	            clipped_grads_and_vars = [
	                (grad if grad is None else tf.clip_by_norm(grad, grad_norm_clipping), var)
	                for grad, var in grads_and_vars
	            ]
	            optimize_expr = optimizer.apply_gradients(clipped_grads_and_vars)

	        # Copy online -> target; variables are paired after sorting both lists by name.
	        online_sorted = sorted(q_func_vars, key=lambda v: v.name)
	        target_sorted = sorted(target_q_func_vars, key=lambda v: v.name)
	        copy_ops = [dst.assign(src) for src, dst in zip(online_sorted, target_sorted)]
	        update_target_expr = tf.group(*copy_ops)

	        # Callable wrappers around the graph.
	        train_inputs = [
	            obs_t_input,
	            act_t_ph,
	            rew_t_ph,
	            obs_tp1_input,
	            done_mask_ph,
	            importance_weights_ph,
	        ]
	        train = U.function(inputs=train_inputs, outputs=td_error, updates=[optimize_expr])
	        update_target = U.function([], [], updates=[update_target_expr])

	        q_values = U.function([obs_t_input], q_t)

	        debug = {'q_values': q_values}
	        return act_f, train, update_target, debug