File size: 21,325 Bytes
5960497 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 | """Deep Q learning graph
The functions in this file can are used to create the following functions:
======= act ========
Function to chose an action given an observation
Parameters
----------
observation: object
Observation that can be feed into the output of make_obs_ph
stochastic: bool
if set to False all the actions are always deterministic (default False)
update_eps_ph: float
update epsilon a new value, if negative no update happens
(default: no update)
Returns
-------
Tensor of dtype tf.int64 and shape (BATCH_SIZE,) with an action to be performed for
every element of the batch.
======= act (in case of parameter noise) ========
Function to chose an action given an observation
Parameters
----------
observation: object
Observation that can be feed into the output of make_obs_ph
stochastic: bool
if set to False all the actions are always deterministic (default False)
update_eps_ph: float
update epsilon to a new value, if negative no update happens
(default: no update)
reset_ph: bool
reset the perturbed policy by sampling a new perturbation
update_param_noise_threshold_ph: float
the desired threshold for the difference between non-perturbed and perturbed policy
update_param_noise_scale_ph: bool
whether or not to update the scale of the noise for the next time it is re-perturbed
Returns
-------
Tensor of dtype tf.int64 and shape (BATCH_SIZE,) with an action to be performed for
every element of the batch.
======= train =======
Function that takes a transition (s,a,r,s') and optimizes Bellman equation's error:
td_error = Q(s,a) - (r + gamma * max_a' Q(s', a'))
loss = huber_loss[td_error]
Parameters
----------
obs_t: object
a batch of observations
action: np.array
actions that were selected upon seeing obs_t.
dtype must be int32 and shape must be (batch_size,)
reward: np.array
immediate reward attained after executing those actions
dtype must be float32 and shape must be (batch_size,)
obs_tp1: object
observations that followed obs_t
done: np.array
1 if obs_t was the last observation in the episode and 0 otherwise
obs_tp1 gets ignored, but must be of the valid shape.
dtype must be float32 and shape must be (batch_size,)
weight: np.array
imporance weights for every element of the batch (gradient is multiplied
by the importance weight) dtype must be float32 and shape must be (batch_size,)
Returns
-------
td_error: np.array
a list of differences between Q(s,a) and the target in Bellman's equation.
dtype is float32 and shape is (batch_size,)
======= update_target ========
copy the parameters from optimized Q function to the target Q function.
In Q learning we actually optimize the following error:
Q(s,a) - (r + gamma * max_a' Q'(s', a'))
Where Q' is lagging behind Q to stablize the learning. For example for Atari
Q' is set to Q once every 10000 updates training steps.
"""
import tensorflow as tf
import baselines.common.tf_util as U
def scope_vars(scope, trainable_only=False):
"""
Get variables inside a scope
The scope can be specified as a string
Parameters
----------
scope: str or VariableScope
scope in which the variables reside.
trainable_only: bool
whether or not to return only the variables that were marked as trainable.
Returns
-------
vars: [tf.Variable]
list of variables in `scope`.
"""
return tf.compat.v1.get_collection(
tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES if trainable_only else tf.compat.v1.GraphKeys.GLOBAL_VARIABLES,
scope=scope if isinstance(scope, str) else scope.name
)
def scope_name():
"""Returns the name of current scope as a string, e.g. deepq/q_func"""
return tf.compat.v1.get_variable_scope().name
def absolute_scope_name(relative_scope_name):
"""Appends parent scope name to `relative_scope_name`"""
return scope_name() + "/" + relative_scope_name
def default_param_noise_filter(var):
if var not in tf.compat.v1.trainable_variables():
# We never perturb non-trainable vars.
return False
if "fully_connected" in var.name:
# We perturb fully-connected layers.
return True
# The remaining layers are likely conv or layer norm layers, which we do not wish to
# perturb (in the former case because they only extract features, in the latter case because
# we use them for normalization purposes). If you change your network, you will likely want
# to re-consider which layers to perturb and which to keep untouched.
return False
def build_act(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None):
"""Creates the act function:
Parameters
----------
make_obs_ph: str -> tf.placeholder or TfInput
a function that take a name and creates a placeholder of input with that name
q_func: (tf.Variable, int, str, bool) -> tf.Variable
the model that takes the following inputs:
observation_in: object
the output of observation placeholder
num_actions: int
number of actions
scope: str
reuse: bool
should be passed to outer variable scope
and returns a tensor of shape (batch_size, num_actions) with values of every action.
num_actions: int
number of actions.
scope: str or VariableScope
optional scope for variable_scope.
reuse: bool or None
whether or not the variables should be reused. To be able to reuse the scope must be given.
Returns
-------
act: (tf.Variable, bool, float) -> tf.Variable
function to select and action given observation.
` See the top of the file for details.
"""
with tf.compat.v1.variable_scope(scope, reuse=reuse):
observations_ph = make_obs_ph("observation")
stochastic_ph = tf.compat.v1.placeholder(tf.bool, (), name="stochastic")
update_eps_ph = tf.compat.v1.placeholder(tf.float32, (), name="update_eps")
eps = tf.compat.v1.get_variable("eps", (), initializer=tf.compat.v1.constant_initializer(0))
q_values = q_func(observations_ph.get(), num_actions, scope="q_func")
deterministic_actions = tf.argmax(input=q_values, axis=1)
batch_size = tf.shape(input=observations_ph.get())[0]
random_actions = tf.random.uniform(tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int64)
chose_random = tf.random.uniform(tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
stochastic_actions = tf.compat.v1.where(chose_random, random_actions, deterministic_actions)
output_actions = tf.cond(pred=stochastic_ph, true_fn=lambda: stochastic_actions, false_fn=lambda: deterministic_actions)
update_eps_expr = eps.assign(tf.cond(pred=update_eps_ph >= 0, true_fn=lambda: update_eps_ph, false_fn=lambda: eps))
_act = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph],
outputs=output_actions,
givens={update_eps_ph: -1.0, stochastic_ph: True},
updates=[update_eps_expr])
def act(ob, stochastic=True, update_eps=-1):
return _act(ob, stochastic, update_eps)
return act
def build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None, param_noise_filter_func=None):
"""Creates the act function with support for parameter space noise exploration (https://arxiv.org/abs/1706.01905):
Parameters
----------
make_obs_ph: str -> tf.placeholder or TfInput
a function that take a name and creates a placeholder of input with that name
q_func: (tf.Variable, int, str, bool) -> tf.Variable
the model that takes the following inputs:
observation_in: object
the output of observation placeholder
num_actions: int
number of actions
scope: str
reuse: bool
should be passed to outer variable scope
and returns a tensor of shape (batch_size, num_actions) with values of every action.
num_actions: int
number of actions.
scope: str or VariableScope
optional scope for variable_scope.
reuse: bool or None
whether or not the variables should be reused. To be able to reuse the scope must be given.
param_noise_filter_func: tf.Variable -> bool
function that decides whether or not a variable should be perturbed. Only applicable
if param_noise is True. If set to None, default_param_noise_filter is used by default.
Returns
-------
act: (tf.Variable, bool, float, bool, float, bool) -> tf.Variable
function to select and action given observation.
` See the top of the file for details.
"""
if param_noise_filter_func is None:
param_noise_filter_func = default_param_noise_filter
with tf.compat.v1.variable_scope(scope, reuse=reuse):
observations_ph = make_obs_ph("observation")
stochastic_ph = tf.compat.v1.placeholder(tf.bool, (), name="stochastic")
update_eps_ph = tf.compat.v1.placeholder(tf.float32, (), name="update_eps")
update_param_noise_threshold_ph = tf.compat.v1.placeholder(tf.float32, (), name="update_param_noise_threshold")
update_param_noise_scale_ph = tf.compat.v1.placeholder(tf.bool, (), name="update_param_noise_scale")
reset_ph = tf.compat.v1.placeholder(tf.bool, (), name="reset")
eps = tf.compat.v1.get_variable("eps", (), initializer=tf.compat.v1.constant_initializer(0))
param_noise_scale = tf.compat.v1.get_variable("param_noise_scale", (), initializer=tf.compat.v1.constant_initializer(0.01), trainable=False)
param_noise_threshold = tf.compat.v1.get_variable("param_noise_threshold", (), initializer=tf.compat.v1.constant_initializer(0.05), trainable=False)
# Unmodified Q.
q_values = q_func(observations_ph.get(), num_actions, scope="q_func")
# Perturbable Q used for the actual rollout.
q_values_perturbed = q_func(observations_ph.get(), num_actions, scope="perturbed_q_func")
# We have to wrap this code into a function due to the way tf.cond() works. See
# https://stackoverflow.com/questions/37063952/confused-by-the-behavior-of-tf-cond for
# a more detailed discussion.
def perturb_vars(original_scope, perturbed_scope):
all_vars = scope_vars(absolute_scope_name(original_scope))
all_perturbed_vars = scope_vars(absolute_scope_name(perturbed_scope))
assert len(all_vars) == len(all_perturbed_vars)
perturb_ops = []
for var, perturbed_var in zip(all_vars, all_perturbed_vars):
if param_noise_filter_func(perturbed_var):
# Perturb this variable.
op = tf.compat.v1.assign(perturbed_var, var + tf.random.normal(shape=tf.shape(input=var), mean=0., stddev=param_noise_scale))
else:
# Do not perturb, just assign.
op = tf.compat.v1.assign(perturbed_var, var)
perturb_ops.append(op)
assert len(perturb_ops) == len(all_vars)
return tf.group(*perturb_ops)
# Set up functionality to re-compute `param_noise_scale`. This perturbs yet another copy
# of the network and measures the effect of that perturbation in action space. If the perturbation
# is too big, reduce scale of perturbation, otherwise increase.
q_values_adaptive = q_func(observations_ph.get(), num_actions, scope="adaptive_q_func")
perturb_for_adaption = perturb_vars(original_scope="q_func", perturbed_scope="adaptive_q_func")
kl = tf.reduce_sum(input_tensor=tf.nn.softmax(q_values) * (tf.math.log(tf.nn.softmax(q_values)) - tf.math.log(tf.nn.softmax(q_values_adaptive))), axis=-1)
mean_kl = tf.reduce_mean(input_tensor=kl)
def update_scale():
with tf.control_dependencies([perturb_for_adaption]):
update_scale_expr = tf.cond(pred=mean_kl < param_noise_threshold,
true_fn=lambda: param_noise_scale.assign(param_noise_scale * 1.01),
false_fn=lambda: param_noise_scale.assign(param_noise_scale / 1.01),
)
return update_scale_expr
# Functionality to update the threshold for parameter space noise.
update_param_noise_threshold_expr = param_noise_threshold.assign(tf.cond(pred=update_param_noise_threshold_ph >= 0,
true_fn=lambda: update_param_noise_threshold_ph, false_fn=lambda: param_noise_threshold))
# Put everything together.
deterministic_actions = tf.argmax(input=q_values_perturbed, axis=1)
batch_size = tf.shape(input=observations_ph.get())[0]
random_actions = tf.random.uniform(tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int64)
chose_random = tf.random.uniform(tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
stochastic_actions = tf.compat.v1.where(chose_random, random_actions, deterministic_actions)
output_actions = tf.cond(pred=stochastic_ph, true_fn=lambda: stochastic_actions, false_fn=lambda: deterministic_actions)
update_eps_expr = eps.assign(tf.cond(pred=update_eps_ph >= 0, true_fn=lambda: update_eps_ph, false_fn=lambda: eps))
updates = [
update_eps_expr,
tf.cond(pred=reset_ph, true_fn=lambda: perturb_vars(original_scope="q_func", perturbed_scope="perturbed_q_func"), false_fn=lambda: tf.group(*[])),
tf.cond(pred=update_param_noise_scale_ph, true_fn=lambda: update_scale(), false_fn=lambda: tf.Variable(0., trainable=False)),
update_param_noise_threshold_expr,
]
_act = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph, reset_ph, update_param_noise_threshold_ph, update_param_noise_scale_ph],
outputs=output_actions,
givens={update_eps_ph: -1.0, stochastic_ph: True, reset_ph: False, update_param_noise_threshold_ph: False, update_param_noise_scale_ph: False},
updates=updates)
def act(ob, reset=False, update_param_noise_threshold=False, update_param_noise_scale=False, stochastic=True, update_eps=-1):
return _act(ob, stochastic, update_eps, reset, update_param_noise_threshold, update_param_noise_scale)
return act
def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0,
double_q=True, scope="deepq", reuse=None, param_noise=False, param_noise_filter_func=None):
"""Creates the train function:
Parameters
----------
make_obs_ph: str -> tf.placeholder or TfInput
a function that takes a name and creates a placeholder of input with that name
q_func: (tf.Variable, int, str, bool) -> tf.Variable
the model that takes the following inputs:
observation_in: object
the output of observation placeholder
num_actions: int
number of actions
scope: str
reuse: bool
should be passed to outer variable scope
and returns a tensor of shape (batch_size, num_actions) with values of every action.
num_actions: int
number of actions
reuse: bool
whether or not to reuse the graph variables
optimizer: tf.train.Optimizer
optimizer to use for the Q-learning objective.
grad_norm_clipping: float or None
clip gradient norms to this value. If None no clipping is performed.
gamma: float
discount rate.
double_q: bool
if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
In general it is a good idea to keep it enabled.
scope: str or VariableScope
optional scope for variable_scope.
reuse: bool or None
whether or not the variables should be reused. To be able to reuse the scope must be given.
param_noise: bool
whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
param_noise_filter_func: tf.Variable -> bool
function that decides whether or not a variable should be perturbed. Only applicable
if param_noise is True. If set to None, default_param_noise_filter is used by default.
Returns
-------
act: (tf.Variable, bool, float) -> tf.Variable
function to select and action given observation.
` See the top of the file for details.
train: (object, np.array, np.array, object, np.array, np.array) -> np.array
optimize the error in Bellman's equation.
` See the top of the file for details.
update_target: () -> ()
copy the parameters from optimized Q function to the target Q function.
` See the top of the file for details.
debug: {str: function}
a bunch of functions to print debug data like q_values.
"""
if param_noise:
act_f = build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse,
param_noise_filter_func=param_noise_filter_func)
else:
act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse)
with tf.compat.v1.variable_scope(scope, reuse=reuse):
# set up placeholders
obs_t_input = make_obs_ph("obs_t")
act_t_ph = tf.compat.v1.placeholder(tf.int32, [None], name="action")
rew_t_ph = tf.compat.v1.placeholder(tf.float32, [None], name="reward")
obs_tp1_input = make_obs_ph("obs_tp1")
done_mask_ph = tf.compat.v1.placeholder(tf.float32, [None], name="done")
importance_weights_ph = tf.compat.v1.placeholder(tf.float32, [None], name="weight")
# q network evaluation
q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act
q_func_vars = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, scope=tf.compat.v1.get_variable_scope().name + "/q_func")
# target q network evalution
q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
target_q_func_vars = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, scope=tf.compat.v1.get_variable_scope().name + "/target_q_func")
# q scores for actions which we know were selected in the given state.
q_t_selected = tf.reduce_sum(input_tensor=q_t * tf.one_hot(act_t_ph, num_actions), axis=1)
# compute estimate of best possible value starting from state at t + 1
if double_q:
q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True)
q_tp1_best_using_online_net = tf.argmax(input=q_tp1_using_online_net, axis=1)
q_tp1_best = tf.reduce_sum(input_tensor=q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), axis=1)
else:
q_tp1_best = tf.reduce_max(input_tensor=q_tp1, axis=1)
q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best
# compute RHS of bellman equation
q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked
# compute the error (potentially clipped)
td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
errors = U.huber_loss(td_error)
weighted_error = tf.reduce_mean(input_tensor=importance_weights_ph * errors)
# compute optimization op (potentially with gradient clipping)
if grad_norm_clipping is not None:
gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars)
for i, (grad, var) in enumerate(gradients):
if grad is not None:
gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var)
optimize_expr = optimizer.apply_gradients(gradients)
else:
optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)
# update_target_fn will be called periodically to copy Q network to target Q network
update_target_expr = []
for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name),
sorted(target_q_func_vars, key=lambda v: v.name)):
update_target_expr.append(var_target.assign(var))
update_target_expr = tf.group(*update_target_expr)
# Create callable functions
train = U.function(
inputs=[
obs_t_input,
act_t_ph,
rew_t_ph,
obs_tp1_input,
done_mask_ph,
importance_weights_ph
],
outputs=td_error,
updates=[optimize_expr]
)
update_target = U.function([], [], updates=[update_target_expr])
q_values = U.function([obs_t_input], q_t)
return act_f, train, update_target, {'q_values': q_values}
|