| """Reward shaping functions used by Contexts.
|
|
|
| Each reward function should take the following inputs and return new rewards,
|
| and discounts.
|
|
|
| new_rewards, discounts = reward_fn(states, actions, rewards,
|
| next_states, contexts)
|
| """
|
|
|
| from __future__ import absolute_import
|
| from __future__ import division
|
| from __future__ import print_function
|
|
|
| import tensorflow as tf
|
| import gin.tf


def summarize_stats(stats):
  """Summarize a dictionary of variables.

  Args:
    stats: a dictionary of {name: tensor} to compute stats over.
  """
  for name, stat in stats.items():
    mean = tf.reduce_mean(stat)
    tf.summary.scalar('mean_%s' % name, mean)
    tf.summary.scalar('max_%s' % name, tf.reduce_max(stat))
    tf.summary.scalar('min_%s' % name, tf.reduce_min(stat))
    std = tf.sqrt(tf.reduce_mean(tf.square(stat)) - tf.square(mean) + 1e-10)
    tf.summary.scalar('std_%s' % name, std)
    tf.summary.histogram(name, stat)


def index_states(states, indices):
  """Return indexed states.

  Args:
    states: A [batch_size, num_state_dims] Tensor representing a batch
      of states.
    indices: (a list or Numpy integer array) Indices of state dimensions
      to be mapped.
  Returns:
    A [batch_size, num_indices] Tensor representing the batch of indexed
    states.
  """
  if indices is None:
    return states
  indices = tf.constant(indices, dtype=tf.int32)
  return tf.gather(states, indices=indices, axis=1)
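# A minimal usage sketch for index_states (illustrative only; the tensor and
# indices below are hypothetical, and a TF1 graph context is assumed, as in
# the rest of this file):
#
#   states = tf.ones([32, 6])               # batch of 6-D states
#   xy = index_states(states, [0, 1])       # -> shape [32, 2]
#   full = index_states(states, None)       # indices=None is a no-op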


def record_tensor(tensor, indices, stats, name='states'):
  """Record specified tensor dimensions into stats.

  Args:
    tensor: A [batch_size, num_dims] Tensor.
    indices: (a list of integers) Indices of dimensions to record.
    stats: A dictionary holding stats.
    name: (string) Name of tensor.
  """
  if indices is None:
    indices = range(tensor.shape.as_list()[1])
  for index in indices:
    stats['%s_%02d' % (name, index)] = tensor[:, index]


@gin.configurable
def potential_rewards(states,
                      actions,
                      rewards,
                      next_states,
                      contexts,
                      gamma=1.0,
                      reward_fn=None):
  """Returns potential-based shaped rewards, -r(s) + gamma * r(s').

  Args:
    states: A [batch_size, num_state_dims] Tensor representing a batch
      of states.
    actions: A [batch_size, num_action_dims] Tensor representing a batch
      of actions.
    rewards: A [batch_size] Tensor representing a batch of rewards.
    next_states: A [batch_size, num_state_dims] Tensor representing a batch
      of next states.
    contexts: A list of [batch_size, num_context_dims] Tensor representing
      a batch of contexts.
    gamma: Reward discount.
    reward_fn: A reward function used as the potential.
  Returns:
    A new tf.float32 [batch_size] rewards Tensor, and
      tf.float32 [batch_size] discounts tensor.
  """
  del actions  # unused
  gamma = tf.to_float(gamma)
  rewards_tp1, discounts = reward_fn(None, None, rewards, next_states, contexts)
  rewards, _ = reward_fn(None, None, rewards, states, contexts)
  return -rewards + gamma * rewards_tp1, discounts
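# A hedged sketch of how potential_rewards composes with another reward
# function from this file (hypothetical wiring, not taken from this file's
# callers). With reward_fn acting as the potential phi, the shaped reward is
# -phi(states) + gamma * phi(next_states), the classic potential-based
# shaping form:
#
#   shaped, discounts = potential_rewards(
#       states, actions, rewards, next_states, contexts,
#       gamma=0.99, reward_fn=negative_mse)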


@gin.configurable
def timed_rewards(states,
                  actions,
                  rewards,
                  next_states,
                  contexts,
                  reward_fn=None,
                  dense=False,
                  timer_index=-1):
  """Returns the timed rewards.

  Args:
    states: A [batch_size, num_state_dims] Tensor representing a batch
      of states.
    actions: A [batch_size, num_action_dims] Tensor representing a batch
      of actions.
    rewards: A [batch_size] Tensor representing a batch of rewards.
    next_states: A [batch_size, num_state_dims] Tensor representing a batch
      of next states.
    contexts: A list of [batch_size, num_context_dims] Tensor representing
      a batch of contexts.
    reward_fn: A reward function.
    dense: (boolean) Whether to provide dense rewards at every step, or
      sparse rewards only once the timer reaches 0.
    timer_index: (integer) The context list index that specifies the timer.
  Returns:
    A new tf.float32 [batch_size] rewards Tensor, and
      tf.float32 [batch_size] discounts tensor.
  """
  assert contexts[timer_index].get_shape().as_list()[1] == 1
  timers = contexts[timer_index][:, 0]
  rewards, discounts = reward_fn(states, actions, rewards, next_states,
                                 contexts)
  terminates = tf.to_float(timers <= 0)  # 1 where the timer has expired
  for _ in range(rewards.shape.ndims - 1):
    terminates = tf.expand_dims(terminates, axis=-1)
  if not dense:
    rewards *= terminates  # pass rewards through only at termination
  discounts *= (tf.to_float(1.0) - terminates)  # zero the discount there
  return rewards, discounts
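# Sketch (hypothetical tensors): the timer is a single-column context that
# counts down to zero. With dense=False the underlying reward passes through
# only where the timer has expired, and the discount is zeroed there to mark
# termination:
#
#   timer = tf.constant([[3.], [0.]])    # second batch element has expired
#   r, d = timed_rewards(states, actions, rewards, next_states,
#                        [goal, timer], reward_fn=plain_rewards)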


@gin.configurable
def reset_rewards(states,
                  actions,
                  rewards,
                  next_states,
                  contexts,
                  reset_index=0,
                  reset_state=None,
                  reset_reward_function=None,
                  include_forward_rewards=True,
                  include_reset_rewards=True):
  """Returns the rewards for a forward/reset agent.

  Args:
    states: A [batch_size, num_state_dims] Tensor representing a batch
      of states.
    actions: A [batch_size, num_action_dims] Tensor representing a batch
      of actions.
    rewards: A [batch_size] Tensor representing a batch of rewards.
    next_states: A [batch_size, num_state_dims] Tensor representing a batch
      of next states.
    contexts: A list of [batch_size, num_context_dims] Tensor representing
      a batch of contexts.
    reset_index: (integer) The context list index that specifies reset.
    reset_state: Reset state.
    reset_reward_function: Reward function for the reset step.
    include_forward_rewards: Include the rewards from the forward pass.
    include_reset_rewards: Include the rewards from the reset pass.

  Returns:
    A new tf.float32 [batch_size] rewards Tensor, and
      tf.float32 [batch_size] discounts tensor.
  """
  reset_state = tf.constant(
      reset_state, dtype=next_states.dtype, shape=next_states.shape)
  reset_states = tf.expand_dims(reset_state, 0)

  def true_fn():
    if include_reset_rewards:
      return reset_reward_function(states, actions, rewards, next_states,
                                   [reset_states] + contexts[1:])
    else:
      return tf.zeros_like(rewards), tf.ones_like(rewards)

  def false_fn():
    if include_forward_rewards:
      return plain_rewards(states, actions, rewards, next_states, contexts)
    else:
      return tf.zeros_like(rewards), tf.ones_like(rewards)

  rewards, discounts = tf.cond(
      tf.cast(contexts[reset_index][0, 0], dtype=tf.bool), true_fn, false_fn)
  return rewards, discounts
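# Note: tf.cond above branches on a single scalar, contexts[reset_index][0, 0],
# so the whole batch is assumed to be in the same mode (reset vs. forward).
# A hedged usage sketch (hypothetical wiring; reset_state must fill a tensor
# of next_states' static shape):
#
#   r, d = reset_rewards(states, actions, rewards, next_states,
#                        [reset_flag, goal],
#                        reset_state=[0.0],
#                        reset_reward_function=negative_mse)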


@gin.configurable
def tanh_similarity(states,
                    actions,
                    rewards,
                    next_states,
                    contexts,
                    mse_scale=1.0,
                    state_scales=1.0,
                    goal_scales=1.0,
                    summarize=False):
  """Returns the similarity between next_states and contexts, as 1 - tanh(mse).

  Args:
    states: A [batch_size, num_state_dims] Tensor representing a batch
      of states.
    actions: A [batch_size, num_action_dims] Tensor representing a batch
      of actions.
    rewards: A [batch_size] Tensor representing a batch of rewards.
    next_states: A [batch_size, num_state_dims] Tensor representing a batch
      of next states.
    contexts: A list of [batch_size, num_context_dims] Tensor representing
      a batch of contexts.
    mse_scale: A float, to scale mse before tanh.
    state_scales: multiplicative scale for (next) states. A scalar or 1D
      tensor, must be broadcastable to number of state dimensions.
    goal_scales: multiplicative scale for contexts. A scalar or 1D tensor,
      must be broadcastable to number of goal dimensions.
    summarize: (boolean) enable summary ops.

  Returns:
    A new tf.float32 [batch_size] rewards Tensor, and
      tf.float32 [batch_size] discounts tensor.
  """
  del states, actions, rewards
  mse = tf.reduce_mean(tf.squared_difference(next_states * state_scales,
                                             contexts[0] * goal_scales), -1)
  tanh = tf.tanh(mse_scale * mse)
  if summarize:
    with tf.name_scope('RewardFn/'):
      tf.summary.scalar('mean_mse', tf.reduce_mean(mse))
      tf.summary.histogram('mse', mse)
      tf.summary.scalar('mean_tanh', tf.reduce_mean(tanh))
      tf.summary.histogram('tanh', tanh)
  rewards = tf.to_float(1 - tanh)
  return rewards, tf.ones_like(rewards)


@gin.configurable
def negative_mse(states,
                 actions,
                 rewards,
                 next_states,
                 contexts,
                 state_scales=1.0,
                 goal_scales=1.0,
                 summarize=False):
  """Returns the negative mean square error between next_states and contexts.

  Args:
    states: A [batch_size, num_state_dims] Tensor representing a batch
      of states.
    actions: A [batch_size, num_action_dims] Tensor representing a batch
      of actions.
    rewards: A [batch_size] Tensor representing a batch of rewards.
    next_states: A [batch_size, num_state_dims] Tensor representing a batch
      of next states.
    contexts: A list of [batch_size, num_context_dims] Tensor representing
      a batch of contexts.
    state_scales: multiplicative scale for (next) states. A scalar or 1D
      tensor, must be broadcastable to number of state dimensions.
    goal_scales: multiplicative scale for contexts. A scalar or 1D tensor,
      must be broadcastable to number of goal dimensions.
    summarize: (boolean) enable summary ops.

  Returns:
    A new tf.float32 [batch_size] rewards Tensor, and
      tf.float32 [batch_size] discounts tensor.
  """
  del states, actions, rewards
  mse = tf.reduce_mean(tf.squared_difference(next_states * state_scales,
                                             contexts[0] * goal_scales), -1)
  if summarize:
    with tf.name_scope('RewardFn/'):
      tf.summary.scalar('mean_mse', tf.reduce_mean(mse))
      tf.summary.histogram('mse', mse)
  rewards = tf.to_float(-mse)
  return rewards, tf.ones_like(rewards)
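# Usage sketch (illustrative values; contexts[0] is the goal): the reward is
# the negative mean squared error between next_states and the goal, so
# identical tensors give 0 and the reward is always <= 0:
#
#   s_next = tf.ones([8, 3])
#   goal = tf.zeros([8, 3])
#   r, d = negative_mse(None, None, None, s_next, [goal])  # r == -1.0 each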


@gin.configurable
def negative_distance(states,
                      actions,
                      rewards,
                      next_states,
                      contexts,
                      state_scales=1.0,
                      goal_scales=1.0,
                      reward_scales=1.0,
                      weight_index=None,
                      weight_vector=None,
                      summarize=False,
                      termination_epsilon=1e-4,
                      state_indices=None,
                      goal_indices=None,
                      vectorize=False,
                      relative_context=False,
                      diff=False,
                      norm='L2',
                      epsilon=1e-10,
                      bonus_epsilon=0.,
                      offset=0.0):
  """Returns the negative euclidean distance between next_states and contexts.

  Args:
    states: A [batch_size, num_state_dims] Tensor representing a batch
      of states.
    actions: A [batch_size, num_action_dims] Tensor representing a batch
      of actions.
    rewards: A [batch_size] Tensor representing a batch of rewards.
    next_states: A [batch_size, num_state_dims] Tensor representing a batch
      of next states.
    contexts: A list of [batch_size, num_context_dims] Tensor representing
      a batch of contexts.
    state_scales: multiplicative scale for (next) states. A scalar or 1D
      tensor, must be broadcastable to number of state dimensions.
    goal_scales: multiplicative scale for goals. A scalar or 1D tensor,
      must be broadcastable to number of goal dimensions.
    reward_scales: multiplicative scale for rewards. A scalar or 1D tensor,
      must be broadcastable to number of reward dimensions.
    weight_index: (integer) The context list index that specifies weight.
    weight_vector: (a number or a list or Numpy array) The weighting vector,
      broadcastable to `next_states`.
    summarize: (boolean) enable summary ops.
    termination_epsilon: terminate if dist is less than this quantity.
    state_indices: (a list of integers) list of state indices to select.
    goal_indices: (a list of integers) list of goal indices to select.
    vectorize: Return a vectorized form.
    relative_context: (boolean) If True, treat the goals as relative to the
      current states, i.e. compare next_states against states + goals.
    diff: (boolean) If True, reward the decrease in distance between steps,
      old_dist - dist, instead of the negative distance itself.
    norm: L1 or L2.
    epsilon: small offset to ensure non-negative/zero distance.
    bonus_epsilon: give a bonus of 1 when dist is below this threshold.
    offset: constant added to the returned rewards.

  Returns:
    A new tf.float32 [batch_size] rewards Tensor, and
      tf.float32 [batch_size] discounts tensor.
  """
  del actions, rewards
  stats = {}
  record_tensor(next_states, state_indices, stats, 'next_states')
  states = index_states(states, state_indices)
  next_states = index_states(next_states, state_indices)
  goals = index_states(contexts[0], goal_indices)
  if relative_context:
    goals = states + goals
  sq_dists = tf.squared_difference(next_states * state_scales,
                                   goals * goal_scales)
  old_sq_dists = tf.squared_difference(states * state_scales,
                                       goals * goal_scales)
  record_tensor(sq_dists, None, stats, 'sq_dists')
  if weight_vector is not None:
    sq_dists *= tf.convert_to_tensor(weight_vector, dtype=next_states.dtype)
    old_sq_dists *= tf.convert_to_tensor(weight_vector, dtype=next_states.dtype)
  if weight_index is not None:
    weights = tf.abs(index_states(contexts[0], weight_index))
    sq_dists *= weights
    old_sq_dists *= weights
  if norm == 'L1':
    dist = tf.sqrt(sq_dists + epsilon)
    old_dist = tf.sqrt(old_sq_dists + epsilon)
    if not vectorize:
      dist = tf.reduce_sum(dist, -1)
      old_dist = tf.reduce_sum(old_dist, -1)
  elif norm == 'L2':
    if vectorize:
      dist = sq_dists
      old_dist = old_sq_dists
    else:
      dist = tf.reduce_sum(sq_dists, -1)
      old_dist = tf.reduce_sum(old_sq_dists, -1)
    dist = tf.sqrt(dist + epsilon)  # epsilon avoids a NaN gradient at 0
    old_dist = tf.sqrt(old_dist + epsilon)
  else:
    raise NotImplementedError(norm)
  discounts = dist > termination_epsilon
  if summarize:
    with tf.name_scope('RewardFn/'):
      tf.summary.scalar('mean_dist', tf.reduce_mean(dist))
      tf.summary.histogram('dist', dist)
      summarize_stats(stats)
  bonus = tf.to_float(dist < bonus_epsilon)
  dist *= reward_scales
  old_dist *= reward_scales
  if diff:
    return bonus + offset + tf.to_float(old_dist - dist), tf.to_float(discounts)
  return bonus + offset + tf.to_float(-dist), tf.to_float(discounts)
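# Illustrative goal-reaching call (a sketch; the indices, epsilon and tensors
# are made up). The returned discount is 0 wherever the distance falls below
# termination_epsilon, signalling termination:
#
#   r, d = negative_distance(states, actions, rewards, next_states, [goal],
#                            state_indices=[0, 1], goal_indices=[0, 1],
#                            termination_epsilon=0.1)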


@gin.configurable
def cosine_similarity(states,
                      actions,
                      rewards,
                      next_states,
                      contexts,
                      state_scales=1.0,
                      goal_scales=1.0,
                      reward_scales=1.0,
                      normalize_states=True,
                      normalize_goals=True,
                      weight_index=None,
                      weight_vector=None,
                      summarize=False,
                      state_indices=None,
                      goal_indices=None,
                      offset=0.0):
  """Returns the cosine similarity between next_states - states and contexts.

  Args:
    states: A [batch_size, num_state_dims] Tensor representing a batch
      of states.
    actions: A [batch_size, num_action_dims] Tensor representing a batch
      of actions.
    rewards: A [batch_size] Tensor representing a batch of rewards.
    next_states: A [batch_size, num_state_dims] Tensor representing a batch
      of next states.
    contexts: A list of [batch_size, num_context_dims] Tensor representing
      a batch of contexts.
    state_scales: multiplicative scale for (next) states. A scalar or 1D
      tensor, must be broadcastable to number of state dimensions.
    goal_scales: multiplicative scale for goals. A scalar or 1D tensor,
      must be broadcastable to number of goal dimensions.
    reward_scales: multiplicative scale for rewards. A scalar or 1D tensor,
      must be broadcastable to number of reward dimensions.
    normalize_states: (boolean) l2-normalize the state direction vector.
    normalize_goals: (boolean) l2-normalize the goal vector.
    weight_index: (integer) The context list index that specifies weight.
    weight_vector: (a number or a list or Numpy array) The weighting vector,
      broadcastable to `next_states`.
    summarize: (boolean) enable summary ops.
    state_indices: (a list of integers) list of state indices to select.
    goal_indices: (a list of integers) list of goal indices to select.
    offset: constant added to the returned rewards.

  Returns:
    A new tf.float32 [batch_size] rewards Tensor, and
      tf.float32 [batch_size] discounts tensor.
  """
  del actions, rewards
  stats = {}
  record_tensor(next_states, state_indices, stats, 'next_states')
  states = index_states(states, state_indices)
  next_states = index_states(next_states, state_indices)
  goals = index_states(contexts[0], goal_indices)

  if weight_vector is not None:
    goals *= tf.convert_to_tensor(weight_vector, dtype=next_states.dtype)
  if weight_index is not None:
    weights = tf.abs(index_states(contexts[0], weight_index))
    goals *= weights

  direction_vec = next_states - states
  if normalize_states:
    direction_vec = tf.nn.l2_normalize(direction_vec, -1)
  goal_vec = goals
  if normalize_goals:
    goal_vec = tf.nn.l2_normalize(goal_vec, -1)

  similarity = tf.reduce_sum(goal_vec * direction_vec, -1)
  discounts = tf.ones_like(similarity)
  return offset + tf.to_float(similarity), tf.to_float(discounts)
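# Sketch (hypothetical tensors): with both normalize flags left True, the
# reward is the cosine of the angle between the step direction
# (next_states - states) and the goal vector, i.e. +1 for moving straight
# along the goal direction and -1 for moving straight away:
#
#   r, d = cosine_similarity(states, None, None, next_states, [goal])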


@gin.configurable
def diff_distance(states,
                  actions,
                  rewards,
                  next_states,
                  contexts,
                  state_scales=1.0,
                  goal_scales=1.0,
                  reward_scales=1.0,
                  weight_index=None,
                  weight_vector=None,
                  summarize=False,
                  termination_epsilon=1e-4,
                  state_indices=None,
                  goal_indices=None,
                  norm='L2',
                  epsilon=1e-10):
  """Returns the difference in euclidean distance between states/next_states
  and contexts.

  Args:
    states: A [batch_size, num_state_dims] Tensor representing a batch
      of states.
    actions: A [batch_size, num_action_dims] Tensor representing a batch
      of actions.
    rewards: A [batch_size] Tensor representing a batch of rewards.
    next_states: A [batch_size, num_state_dims] Tensor representing a batch
      of next states.
    contexts: A list of [batch_size, num_context_dims] Tensor representing
      a batch of contexts.
    state_scales: multiplicative scale for (next) states. A scalar or 1D
      tensor, must be broadcastable to number of state dimensions.
    goal_scales: multiplicative scale for goals. A scalar or 1D tensor,
      must be broadcastable to number of goal dimensions.
    reward_scales: multiplicative scale for rewards. A scalar or 1D tensor,
      must be broadcastable to number of reward dimensions.
    weight_index: (integer) The context list index that specifies weight.
    weight_vector: (a number or a list or Numpy array) The weighting vector,
      broadcastable to `next_states`.
    summarize: (boolean) enable summary ops.
    termination_epsilon: terminate if dist is less than this quantity.
    state_indices: (a list of integers) list of state indices to select.
    goal_indices: (a list of integers) list of goal indices to select.
    norm: L1 or L2.
    epsilon: small offset to ensure non-negative/zero distance.

  Returns:
    A new tf.float32 [batch_size] rewards Tensor, and
      tf.float32 [batch_size] discounts tensor.
  """
  del actions, rewards
  stats = {}
  record_tensor(next_states, state_indices, stats, 'next_states')
  next_states = index_states(next_states, state_indices)
  states = index_states(states, state_indices)
  goals = index_states(contexts[0], goal_indices)
  next_sq_dists = tf.squared_difference(next_states * state_scales,
                                        goals * goal_scales)
  sq_dists = tf.squared_difference(states * state_scales,
                                   goals * goal_scales)
  record_tensor(sq_dists, None, stats, 'sq_dists')
  if weight_vector is not None:
    next_sq_dists *= tf.convert_to_tensor(weight_vector,
                                          dtype=next_states.dtype)
    sq_dists *= tf.convert_to_tensor(weight_vector, dtype=next_states.dtype)
  if weight_index is not None:
    next_sq_dists *= contexts[weight_index]
    sq_dists *= contexts[weight_index]
  if norm == 'L1':
    next_dist = tf.sqrt(next_sq_dists + epsilon)
    dist = tf.sqrt(sq_dists + epsilon)
    next_dist = tf.reduce_sum(next_dist, -1)
    dist = tf.reduce_sum(dist, -1)
  elif norm == 'L2':
    next_dist = tf.reduce_sum(next_sq_dists, -1)
    next_dist = tf.sqrt(next_dist + epsilon)  # epsilon avoids a NaN gradient
    dist = tf.reduce_sum(sq_dists, -1)
    dist = tf.sqrt(dist + epsilon)
  else:
    raise NotImplementedError(norm)
  discounts = next_dist > termination_epsilon
  if summarize:
    with tf.name_scope('RewardFn/'):
      tf.summary.scalar('mean_dist', tf.reduce_mean(dist))
      tf.summary.histogram('dist', dist)
      summarize_stats(stats)
  diff = dist - next_dist
  diff *= reward_scales
  return tf.to_float(diff), tf.to_float(discounts)


@gin.configurable
def binary_indicator(states,
                     actions,
                     rewards,
                     next_states,
                     contexts,
                     termination_epsilon=1e-4,
                     offset=0,
                     epsilon=1e-10,
                     state_indices=None,
                     summarize=False):
  """Returns 0/1 by checking if next_states and contexts overlap.

  Args:
    states: A [batch_size, num_state_dims] Tensor representing a batch
      of states.
    actions: A [batch_size, num_action_dims] Tensor representing a batch
      of actions.
    rewards: A [batch_size] Tensor representing a batch of rewards.
    next_states: A [batch_size, num_state_dims] Tensor representing a batch
      of next states.
    contexts: A list of [batch_size, num_context_dims] Tensor representing
      a batch of contexts.
    termination_epsilon: terminate if dist is less than this quantity.
    offset: Offset the rewards.
    epsilon: small offset to ensure non-negative/zero distance.
    state_indices: (a list of integers) list of state indices to select.
    summarize: (boolean) enable summary ops.

  Returns:
    A new tf.float32 [batch_size] rewards Tensor, and
      tf.float32 [batch_size] discounts tensor.
  """
  del states, actions
  next_states = index_states(next_states, state_indices)
  dist = tf.reduce_sum(tf.squared_difference(next_states, contexts[0]), -1)
  dist = tf.sqrt(dist + epsilon)
  discounts = dist > termination_epsilon
  rewards = tf.logical_not(discounts)
  rewards = tf.to_float(rewards) + offset
  return tf.to_float(rewards), tf.ones_like(tf.to_float(discounts))
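# Sketch (hypothetical values): a sparse success signal. The reward is
# 1 + offset where the L2 gap between next_states and the goal context is at
# most termination_epsilon, and offset elsewhere; the returned discounts are
# all ones:
#
#   r, d = binary_indicator(None, None, None, next_states, [goal],
#                           termination_epsilon=0.1)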


@gin.configurable
def plain_rewards(states, actions, rewards, next_states, contexts):
  """Returns the given rewards.

  Args:
    states: A [batch_size, num_state_dims] Tensor representing a batch
      of states.
    actions: A [batch_size, num_action_dims] Tensor representing a batch
      of actions.
    rewards: A [batch_size] Tensor representing a batch of rewards.
    next_states: A [batch_size, num_state_dims] Tensor representing a batch
      of next states.
    contexts: A list of [batch_size, num_context_dims] Tensor representing
      a batch of contexts.

  Returns:
    A new tf.float32 [batch_size] rewards Tensor, and
      tf.float32 [batch_size] discounts tensor.
  """
  del states, actions, next_states, contexts
  return rewards, tf.ones_like(rewards)


@gin.configurable
def ctrl_rewards(states,
                 actions,
                 rewards,
                 next_states,
                 contexts,
                 reward_scales=1.0):
  """Returns the negative control cost.

  Args:
    states: A [batch_size, num_state_dims] Tensor representing a batch
      of states.
    actions: A [batch_size, num_action_dims] Tensor representing a batch
      of actions.
    rewards: A [batch_size] Tensor representing a batch of rewards.
    next_states: A [batch_size, num_state_dims] Tensor representing a batch
      of next states.
    contexts: A list of [batch_size, num_context_dims] Tensor representing
      a batch of contexts.
    reward_scales: multiplicative scale for rewards. A scalar or 1D tensor,
      must be broadcastable to number of reward dimensions.

  Returns:
    A new tf.float32 [batch_size] rewards Tensor, and
      tf.float32 [batch_size] discounts tensor.
  """
  del states, rewards, contexts
  if actions is None:
    rewards = tf.to_float(tf.zeros(shape=next_states.shape[:1]))
  else:
    rewards = -tf.reduce_sum(tf.square(actions), axis=1)
    rewards *= reward_scales
  rewards = tf.to_float(rewards)
  return rewards, tf.ones_like(rewards)
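# Sketch (illustrative tensors): a pure quadratic action penalty,
# r = -sum(a ** 2) per batch element:
#
#   a = tf.ones([8, 2])
#   r, d = ctrl_rewards(None, a, None, next_states, None)  # r == -2.0 each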


@gin.configurable
def diff_rewards(states,
                 actions,
                 rewards,
                 next_states,
                 contexts,
                 state_indices=None,
                 goal_index=0):
  """Returns (next_states - goals) as a batched vector reward."""
  del states, rewards, actions
  if state_indices is not None:
    next_states = index_states(next_states, state_indices)
  rewards = tf.to_float(next_states - contexts[goal_index])
  return rewards, tf.ones_like(rewards)


@gin.configurable
def state_rewards(states,
                  actions,
                  rewards,
                  next_states,
                  contexts,
                  weight_index=None,
                  state_indices=None,
                  weight_vector=1.0,
                  offset_vector=0.0,
                  summarize=False):
  """Returns rewards that are a linear mapping of next_states.

  Args:
    states: A [batch_size, num_state_dims] Tensor representing a batch
      of states.
    actions: A [batch_size, num_action_dims] Tensor representing a batch
      of actions.
    rewards: A [batch_size] Tensor representing a batch of rewards.
    next_states: A [batch_size, num_state_dims] Tensor representing a batch
      of next states.
    contexts: A list of [batch_size, num_context_dims] Tensor representing
      a batch of contexts.
    weight_index: (integer) The context list index that specifies weighting.
    state_indices: (a list or Numpy integer array) Indices of state
      dimensions to be mapped.
    weight_vector: (a number or a list or Numpy array) The weighting vector,
      broadcastable to `next_states`.
    offset_vector: (a number or a list or Numpy array) The offset vector.
    summarize: (boolean) enable summary ops.

  Returns:
    A new tf.float32 [batch_size] rewards Tensor, and
      tf.float32 [batch_size] discounts tensor.
  """
  del states, actions, rewards
  stats = {}
  record_tensor(next_states, state_indices, stats)
  next_states = index_states(next_states, state_indices)
  weight = tf.constant(
      weight_vector, dtype=next_states.dtype, shape=next_states[0].shape)
  weights = tf.expand_dims(weight, 0)
  offset = tf.constant(
      offset_vector, dtype=next_states.dtype, shape=next_states[0].shape)
  offsets = tf.expand_dims(offset, 0)
  if weight_index is not None:
    weights *= contexts[weight_index]
  rewards = tf.to_float(tf.reduce_sum(weights * (next_states + offsets),
                                      axis=1))
  if summarize:
    with tf.name_scope('RewardFn/'):
      summarize_stats(stats)
  return rewards, tf.ones_like(rewards)
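# Sketch (hypothetical values): a linear reward over selected state
# dimensions, r = sum(weight * (next_state + offset)). For instance, to
# reward position along dimension 0:
#
#   r, d = state_rewards(None, None, None, next_states, [],
#                        state_indices=[0], weight_vector=1.0,
#                        offset_vector=0.0)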