| | """Deep Q learning graph |
| | |
| | The functions in this file are used to create the following functions: |
| | |
| | ======= act ======== |
| | |
| | Function to choose an action given an observation |
| | |
| | Parameters |
| | ---------- |
| | observation: object |
| | Observation that can be fed into the output of make_obs_ph |
| | stochastic: bool |
| | if set to False all the actions are always deterministic (default True) |
| | update_eps_ph: float |
| | update epsilon to a new value, if negative no update happens |
| | (default: no update) |
| | |
| | Returns |
| | ------- |
| | Tensor of dtype tf.int64 and shape (BATCH_SIZE,) with an action to be performed for |
| | every element of the batch. |
| | |
| | |
| | ======= act (in case of parameter noise) ======== |
| | |
| | Function to choose an action given an observation |
| | |
| | Parameters |
| | ---------- |
| | observation: object |
| | Observation that can be fed into the output of make_obs_ph |
| | stochastic: bool |
| | if set to False all the actions are always deterministic (default True) |
| | update_eps_ph: float |
| | update epsilon to a new value, if negative no update happens |
| | (default: no update) |
| | reset_ph: bool |
| | reset the perturbed policy by sampling a new perturbation |
| | update_param_noise_threshold_ph: float |
| | the desired threshold for the difference between non-perturbed and perturbed policy |
| | update_param_noise_scale_ph: bool |
| | whether or not to update the scale of the noise for the next time it is re-perturbed |
| | |
| | Returns |
| | ------- |
| | Tensor of dtype tf.int64 and shape (BATCH_SIZE,) with an action to be performed for |
| | every element of the batch. |
| | |
| | |
| | ======= train ======= |
| | |
| | Function that takes a transition (s,a,r,s') and optimizes Bellman equation's error: |
| | |
| | td_error = Q(s,a) - (r + gamma * max_a' Q(s', a')) |
| | loss = huber_loss[td_error] |
| | |
| | Parameters |
| | ---------- |
| | obs_t: object |
| | a batch of observations |
| | action: np.array |
| | actions that were selected upon seeing obs_t. |
| | dtype must be int32 and shape must be (batch_size,) |
| | reward: np.array |
| | immediate reward attained after executing those actions |
| | dtype must be float32 and shape must be (batch_size,) |
| | obs_tp1: object |
| | observations that followed obs_t |
| | done: np.array |
| | 1 if obs_t was the last observation in the episode and 0 otherwise |
| | obs_tp1 gets ignored, but must be of the valid shape. |
| | dtype must be float32 and shape must be (batch_size,) |
| | weight: np.array |
| | importance weights for every element of the batch (gradient is multiplied |
| | by the importance weight) dtype must be float32 and shape must be (batch_size,) |
| | |
| | Returns |
| | ------- |
| | td_error: np.array |
| | a list of differences between Q(s,a) and the target in Bellman's equation. |
| | dtype is float32 and shape is (batch_size,) |
| | |
| | ======= update_target ======== |
| | |
| | copy the parameters from optimized Q function to the target Q function. |
| | In Q learning we actually optimize the following error: |
| | |
| | Q(s,a) - (r + gamma * max_a' Q'(s', a')) |
| | |
| | Where Q' is lagging behind Q to stabilize the learning. For example for Atari |
| | |
| | Q' is set to Q once every 10000 training steps. |
| | |
| | """ |
| | import tensorflow as tf |
| | import baselines.common.tf_util as U |
| |
|
| |
|
def scope_vars(scope, trainable_only=False):
    """
    Collect the variables registered under a variable scope.

    Parameters
    ----------
    scope: str or VariableScope
        scope in which the variables reside. A string is used as-is; any
        other object is resolved through its `name` attribute.
    trainable_only: bool
        if True the TRAINABLE_VARIABLES collection is queried, otherwise the
        GLOBAL_VARIABLES collection is queried.

    Returns
    -------
    vars: [tf.Variable]
        list of variables in `scope`.
    """
    graph_keys = tf.compat.v1.GraphKeys
    if trainable_only:
        collection = graph_keys.TRAINABLE_VARIABLES
    else:
        collection = graph_keys.GLOBAL_VARIABLES

    if not isinstance(scope, str):
        scope = scope.name
    return tf.compat.v1.get_collection(collection, scope=scope)
| |
|
| |
|
def scope_name():
    """Return the name of the currently active variable scope as a string, e.g. deepq/q_func"""
    current_scope = tf.compat.v1.get_variable_scope()
    return current_scope.name
| |
|
| |
|
def absolute_scope_name(relative_scope_name):
    """Prefix `relative_scope_name` with the current scope name, joined by "/"."""
    return "/".join((scope_name(), relative_scope_name))
| |
|
| |
|
def default_param_noise_filter(var):
    """Decide whether `var` should receive parameter-space noise.

    A variable is perturbed only when it is one of the graph's trainable
    variables AND its name contains "fully_connected". Non-trainable
    variables and every other kind of layer are left untouched; if the
    network architecture changes, the choice of layers to perturb should be
    reconsidered.

    Parameters
    ----------
    var: tf.Variable
        candidate variable.

    Returns
    -------
    bool
        True if the variable should be perturbed.
    """
    is_trainable = var in tf.compat.v1.trainable_variables()
    return is_trainable and "fully_connected" in var.name
| |
|
| |
|
def build_act(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None):
    """Creates the epsilon-greedy act function.

    For every element of the batch a uniform sample in [0, 1) is compared with
    the `eps` variable (initialised to 0): when the sample is smaller a
    uniformly random action in [0, num_actions) is used, otherwise the argmax
    of the Q values is used. With `stochastic=False` the argmax is always used.

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation, called as
        `act(ob, stochastic=True, update_eps=-1)`. A non-negative `update_eps`
        is stored into `eps`; a negative one leaves `eps` unchanged.
    """
    v1 = tf.compat.v1
    with v1.variable_scope(scope, reuse=reuse):
        # Inputs of the act function.
        obs_input = make_obs_ph("observation")
        stochastic_ph = v1.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = v1.placeholder(tf.float32, (), name="update_eps")

        # Exploration rate, kept in the graph so it persists between calls.
        eps = v1.get_variable("eps", (), initializer=v1.constant_initializer(0))

        # Greedy choice: best action according to the Q network.
        q_values = q_func(obs_input.get(), num_actions, scope="q_func")
        greedy_actions = tf.argmax(input=q_values, axis=1)

        # Per-sample coin flip between a random action and the greedy one.
        batch_size = tf.shape(input=obs_input.get())[0]
        random_actions = tf.random.uniform(
            tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int64)
        explore_sample = tf.random.uniform(
            tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32)
        explore_mask = explore_sample < eps
        eps_greedy_actions = v1.where(explore_mask, random_actions, greedy_actions)

        def _use_eps_greedy():
            return eps_greedy_actions

        def _use_greedy():
            return greedy_actions

        output_actions = tf.cond(pred=stochastic_ph, true_fn=_use_eps_greedy, false_fn=_use_greedy)

        def _requested_eps():
            return update_eps_ph

        def _current_eps():
            return eps

        # Only a non-negative update_eps overwrites the stored epsilon.
        next_eps = tf.cond(pred=update_eps_ph >= 0, true_fn=_requested_eps, false_fn=_current_eps)
        update_eps_expr = eps.assign(next_eps)

        _act = U.function(
            inputs=[obs_input, stochastic_ph, update_eps_ph],
            outputs=output_actions,
            givens={update_eps_ph: -1.0, stochastic_ph: True},
            updates=[update_eps_expr],
        )

        def act(ob, stochastic=True, update_eps=-1):
            return _act(ob, stochastic, update_eps)

        return act
| |
|
| |
|
def build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None, param_noise_filter_func=None):
    """Creates the act function with support for parameter space noise exploration (https://arxiv.org/abs/1706.01905):

    Three copies of the Q network are built: the unmodified one ("q_func"), a
    perturbed one ("perturbed_q_func") whose outputs are used to select
    actions, and an adaptive one ("adaptive_q_func") that is only used to
    adapt the scale of the noise.

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed.
        If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float, bool, float, bool) -> tf.Variable
        function to select an action given an observation, called as
        `act(ob, reset=False, update_param_noise_threshold=-1.0,
        update_param_noise_scale=False, stochastic=True, update_eps=-1)`.
        A negative `update_eps` / `update_param_noise_threshold` (or, for the
        threshold, a literal `False`) leaves the stored value unchanged.
    """
    if param_noise_filter_func is None:
        param_noise_filter_func = default_param_noise_filter

    with tf.compat.v1.variable_scope(scope, reuse=reuse):
        observations_ph = make_obs_ph("observation")
        stochastic_ph = tf.compat.v1.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = tf.compat.v1.placeholder(tf.float32, (), name="update_eps")
        update_param_noise_threshold_ph = tf.compat.v1.placeholder(tf.float32, (), name="update_param_noise_threshold")
        update_param_noise_scale_ph = tf.compat.v1.placeholder(tf.bool, (), name="update_param_noise_scale")
        reset_ph = tf.compat.v1.placeholder(tf.bool, (), name="reset")

        eps = tf.compat.v1.get_variable("eps", (), initializer=tf.compat.v1.constant_initializer(0))
        param_noise_scale = tf.compat.v1.get_variable("param_noise_scale", (), initializer=tf.compat.v1.constant_initializer(0.01), trainable=False)
        param_noise_threshold = tf.compat.v1.get_variable("param_noise_threshold", (), initializer=tf.compat.v1.constant_initializer(0.05), trainable=False)

        # Unmodified Q network.
        q_values = q_func(observations_ph.get(), num_actions, scope="q_func")

        # Perturbable copy of the Q network; actions are selected from its outputs.
        q_values_perturbed = q_func(observations_ph.get(), num_actions, scope="perturbed_q_func")

        def perturb_vars(original_scope, perturbed_scope):
            """Build an op that overwrites every variable of `perturbed_scope`
            with its counterpart from `original_scope`, adding Gaussian noise
            (stddev `param_noise_scale`) to the variables accepted by
            `param_noise_filter_func`."""
            all_vars = scope_vars(absolute_scope_name(original_scope))
            all_perturbed_vars = scope_vars(absolute_scope_name(perturbed_scope))
            assert len(all_vars) == len(all_perturbed_vars)
            perturb_ops = []
            for var, perturbed_var in zip(all_vars, all_perturbed_vars):
                if param_noise_filter_func(perturbed_var):
                    # Perturb this variable.
                    op = tf.compat.v1.assign(perturbed_var, var + tf.random.normal(shape=tf.shape(input=var), mean=0., stddev=param_noise_scale))
                else:
                    # Do not perturb, just copy the current value over.
                    op = tf.compat.v1.assign(perturbed_var, var)
                perturb_ops.append(op)
            assert len(perturb_ops) == len(all_vars)
            return tf.group(*perturb_ops)

        # Separate perturbed copy used only to adapt the noise scale: the KL
        # divergence between the softmax of the unperturbed and of the
        # adaptive Q values is compared against `param_noise_threshold`.
        q_values_adaptive = q_func(observations_ph.get(), num_actions, scope="adaptive_q_func")
        perturb_for_adaption = perturb_vars(original_scope="q_func", perturbed_scope="adaptive_q_func")
        kl = tf.reduce_sum(input_tensor=tf.nn.softmax(q_values) * (tf.math.log(tf.nn.softmax(q_values)) - tf.math.log(tf.nn.softmax(q_values_adaptive))), axis=-1)
        mean_kl = tf.reduce_mean(input_tensor=kl)

        def update_scale():
            # Grow the scale by 1% while the mean KL is below the threshold,
            # shrink it by 1% otherwise.
            # NOTE(review): mean_kl is built outside this control-dependency
            # block, so only the ops created here are ordered after the
            # re-perturbation -- confirm the KL sees the fresh perturbation.
            with tf.control_dependencies([perturb_for_adaption]):
                update_scale_expr = tf.cond(pred=mean_kl < param_noise_threshold,
                                            true_fn=lambda: param_noise_scale.assign(param_noise_scale * 1.01),
                                            false_fn=lambda: param_noise_scale.assign(param_noise_scale / 1.01),
                                            )
            return update_scale_expr

        # A non-negative fed value replaces the stored threshold; a negative one keeps it.
        update_param_noise_threshold_expr = param_noise_threshold.assign(tf.cond(pred=update_param_noise_threshold_ph >= 0,
            true_fn=lambda: update_param_noise_threshold_ph, false_fn=lambda: param_noise_threshold))

        # Epsilon-greedy selection on top of the perturbed network.
        deterministic_actions = tf.argmax(input=q_values_perturbed, axis=1)
        batch_size = tf.shape(input=observations_ph.get())[0]
        random_actions = tf.random.uniform(tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int64)
        chose_random = tf.random.uniform(tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
        stochastic_actions = tf.compat.v1.where(chose_random, random_actions, deterministic_actions)

        output_actions = tf.cond(pred=stochastic_ph, true_fn=lambda: stochastic_actions, false_fn=lambda: deterministic_actions)
        update_eps_expr = eps.assign(tf.cond(pred=update_eps_ph >= 0, true_fn=lambda: update_eps_ph, false_fn=lambda: eps))
        updates = [
            update_eps_expr,
            tf.cond(pred=reset_ph, true_fn=lambda: perturb_vars(original_scope="q_func", perturbed_scope="perturbed_q_func"), false_fn=lambda: tf.group(*[])),
            tf.cond(pred=update_param_noise_scale_ph, true_fn=lambda: update_scale(), false_fn=lambda: tf.Variable(0., trainable=False)),
            update_param_noise_threshold_expr,
        ]
        # The threshold default must be negative: a default of False would be
        # fed to the float32 placeholder as 0.0, satisfy `>= 0`, and reset the
        # stored threshold to zero.
        # (givens presumably supply values for inputs that are not fed --
        # see baselines.common.tf_util.function.)
        _act = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph, reset_ph, update_param_noise_threshold_ph, update_param_noise_scale_ph],
                          outputs=output_actions,
                          givens={update_eps_ph: -1.0, stochastic_ph: True, reset_ph: False, update_param_noise_threshold_ph: -1.0, update_param_noise_scale_ph: False},
                          updates=updates)

        def act(ob, reset=False, update_param_noise_threshold=-1.0, update_param_noise_scale=False, stochastic=True, update_eps=-1):
            # Backward compatibility: the previous default was False, which
            # callers passed to mean "do not update"; map it to the negative
            # sentinel instead of letting it become a threshold of 0.0.
            if update_param_noise_threshold is False:
                update_param_noise_threshold = -1.0
            return _act(ob, stochastic, update_eps, reset, update_param_noise_threshold, update_param_noise_scale)

        return act
| |
|
| |
|
def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0,
                double_q=True, scope="deepq", reuse=None, param_noise=False, param_noise_filter_func=None):
    """Creates the act, train and update_target functions.

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip every gradient to this norm. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461):
        the next action is chosen with the online network and evaluated with
        the target network. In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation; takes
        (obs_t, action, reward, obs_tp1, done, weight) and returns the TD error.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        debugging helpers; currently only 'q_values', which evaluates the
        online Q network on a batch of observations.
    """
    # Action-selection function, with or without parameter space noise.
    if not param_noise:
        act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse)
    else:
        act_f = build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse,
                                           param_noise_filter_func=param_noise_filter_func)

    v1 = tf.compat.v1
    with v1.variable_scope(scope, reuse=reuse):
        # Placeholders for one batch of transitions.
        obs_t_input = make_obs_ph("obs_t")
        act_t_ph = v1.placeholder(tf.int32, [None], name="action")
        rew_t_ph = v1.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = make_obs_ph("obs_tp1")
        done_mask_ph = v1.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = v1.placeholder(tf.float32, [None], name="weight")

        # Online network evaluated on obs_t; reuse=True shares the "q_func"
        # variables that the act function already created in this scope.
        q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True)
        q_func_vars = scope_vars(absolute_scope_name("q_func"))

        # Target network evaluated on obs_tp1.
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
        target_q_func_vars = scope_vars(absolute_scope_name("target_q_func"))

        # Q value of the action that was actually taken.
        taken_action_mask = tf.one_hot(act_t_ph, num_actions)
        q_t_selected = tf.reduce_sum(input_tensor=q_t * taken_action_mask, axis=1)

        # Value of the best next action, zeroed where the episode ended.
        if not double_q:
            q_tp1_best = tf.reduce_max(input_tensor=q_tp1, axis=1)
        else:
            q_tp1_online = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True)
            best_next_action = tf.argmax(input=q_tp1_online, axis=1)
            best_next_action_mask = tf.one_hot(best_next_action, num_actions)
            q_tp1_best = tf.reduce_sum(input_tensor=q_tp1 * best_next_action_mask, axis=1)
        not_done = 1.0 - done_mask_ph
        q_tp1_best_masked = not_done * q_tp1_best

        # Right-hand side of the Bellman equation.
        discounted_next_value = gamma * q_tp1_best_masked
        q_t_selected_target = rew_t_ph + discounted_next_value

        # Importance-weighted Huber loss of the TD error; the target is
        # excluded from differentiation.
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(input_tensor=importance_weights_ph * errors)

        # Optimisation step, restricted to the online network's variables.
        if grad_norm_clipping is None:
            optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)
        else:
            grads_and_vars = optimizer.compute_gradients(weighted_error, var_list=q_func_vars)
            clipped_grads_and_vars = [
                (grad if grad is None else tf.clip_by_norm(grad, grad_norm_clipping), var)
                for grad, var in grads_and_vars
            ]
            optimize_expr = optimizer.apply_gradients(clipped_grads_and_vars)

        # Copy online -> target, pairing the variables by sorted name.
        def _by_name(variable):
            return variable.name

        online_sorted = sorted(q_func_vars, key=_by_name)
        target_sorted = sorted(target_q_func_vars, key=_by_name)
        assign_ops = [
            target_var.assign(online_var)
            for online_var, target_var in zip(online_sorted, target_sorted)
        ]
        update_target_expr = tf.group(*assign_ops)

        # Callable wrappers around the graph.
        train_inputs = [
            obs_t_input,
            act_t_ph,
            rew_t_ph,
            obs_tp1_input,
            done_mask_ph,
            importance_weights_ph,
        ]
        train = U.function(inputs=train_inputs, outputs=td_error, updates=[optimize_expr])
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)

        return act_f, train, update_target, {'q_values': q_values}
| |
|