| """Evaluation utility functions.
|
| """
|
|
|
| from __future__ import absolute_import
|
| from __future__ import division
|
| from __future__ import print_function
|
| import numpy as np
|
| import tensorflow as tf
|
| from collections import namedtuple
|
| logging = tf.logging
|
| import gin.tf
|
|
|
|
|
@gin.configurable
def evaluate_checkpoint_repeatedly(checkpoint_dir,
                                   evaluate_checkpoint_fn,
                                   eval_interval_secs=600,
                                   max_number_of_evaluations=None,
                                   checkpoint_timeout=None,
                                   timeout_fn=None):
  """Evaluates a checkpointed model at a set interval."""
  if max_number_of_evaluations is not None and max_number_of_evaluations <= 0:
    raise ValueError(
        '`max_number_of_evaluations` must be either None or a positive number.')

  number_of_evaluations = 0
  for checkpoint_path in tf.contrib.training.checkpoints_iterator(
      checkpoint_dir,
      min_interval_secs=eval_interval_secs,
      timeout=checkpoint_timeout,
      timeout_fn=timeout_fn):
    should_stop = False
    retries = 3
    for _ in range(retries):
      try:
        should_stop = evaluate_checkpoint_fn(checkpoint_path)
        break
      except tf.errors.DataLossError:
        logging.warn(
            'Encountered a DataLossError while evaluating a checkpoint. This '
            'can happen when reading a checkpoint before it is fully written. '
            'Retrying...')
        time.sleep(2.0)
    # Stop if the evaluation callback requests it or the configured cap on
    # the number of evaluations is reached.
    number_of_evaluations += 1
    if should_stop or (max_number_of_evaluations is not None and
                       number_of_evaluations >= max_number_of_evaluations):
      break

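# A minimal usage sketch (not part of the library): `my_eval_fn` is a
# hypothetical callback that restores weights from `checkpoint_path`, runs an
# eval pass, and returns True to stop evaluating.
#
#   def my_eval_fn(checkpoint_path):
#     ...  # restore the model from checkpoint_path and run evaluation
#     return False  # keep evaluating
#
#   evaluate_checkpoint_repeatedly('/tmp/train_logs', my_eval_fn,
#                                  eval_interval_secs=300,
#                                  max_number_of_evaluations=10)
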
def compute_model_loss(sess, model_rollout_fn, states, actions):
  """Computes model prediction loss over an open-loop rollout.

  Each prediction is fed back in as the input state for the next step, and
  the per-step loss is the L2 distance between the predicted and observed
  states.
  """
  preds, losses = [], []
  # The first state is given, so its prediction is exact and its loss is 0.
  preds.append(states[0])
  losses.append(0)
  for state, action in zip(states[1:], actions[1:]):
    pred = model_rollout_fn(sess, preds[-1], action)
    loss = np.sqrt(np.sum((state - pred) ** 2))
    preds.append(pred)
    losses.append(loss)
  return preds, losses

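# A minimal usage sketch (not part of the library): `rollout_fn` is a
# hypothetical dynamics model that predicts the next state from a state and
# action; `states`/`actions` come from a logged trajectory.
#
#   preds, losses = compute_model_loss(sess, rollout_fn, states, actions)
#   print('mean open-loop prediction error: %f' % np.mean(losses[1:]))
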
def compute_average_reward(sess, env_base, step_fn, gamma, num_steps,
                           num_episodes):
  """Computes the discounted reward, averaged over episodes.

  Args:
    sess: The tensorflow session.
    env_base: A python environment.
    step_fn: A function that takes in `sess` and returns a list of
      [state, action, transition_type, reward, meta_reward, discount, ...]
      values.
    gamma: discounting factor to apply to the reward.
    num_steps: number of steps to compute the reward over.
    num_episodes: number of episodes to average the reward over.
  Returns:
    average_reward: average cumulative discounted reward.
    average_last_reward: average reward received at the final step.
    average_meta_reward: average cumulative discounted meta-reward.
    average_last_meta_reward: average meta-reward received at the final step.
    average_success: fraction of episodes counted as successful.
    states: states visited in the final episode.
    actions: actions taken in the final episode.
  """
  average_reward = 0
  average_last_reward = 0
  average_meta_reward = 0
  average_last_meta_reward = 0
  average_success = 0.
  states, actions = None, None
  for i in range(num_episodes):
    env_base.end_episode()
    env_base.begin_episode()
    (reward, last_reward, meta_reward, last_meta_reward,
     states, actions) = compute_reward(
         sess, step_fn, gamma, num_steps)
    # Success is determined by the meta-reward at the final step.
    s_reward = last_meta_reward
    success = (s_reward > -5.0)
    logging.info('Episode = %d, reward = %s, meta_reward = %f, '
                 'last_reward = %s, last meta_reward = %f, success = %s',
                 i, reward, meta_reward, last_reward, last_meta_reward,
                 success)
    average_reward += reward
    average_last_reward += last_reward
    average_meta_reward += meta_reward
    average_last_meta_reward += last_meta_reward
    average_success += success
  average_reward /= num_episodes
  average_last_reward /= num_episodes
  average_meta_reward /= num_episodes
  average_last_meta_reward /= num_episodes
  average_success /= num_episodes
  return (average_reward, average_last_reward,
          average_meta_reward, average_last_meta_reward,
          average_success,
          states, actions)

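# A minimal usage sketch (not part of the library): `env` and `agent_step_fn`
# are hypothetical stand-ins for an environment exposing
# begin_episode/end_episode and a step function with the contract documented
# above.
#
#   (avg_reward, avg_last_reward, avg_meta_reward, avg_last_meta_reward,
#    avg_success, states, actions) = compute_average_reward(
#        sess, env, agent_step_fn, gamma=0.99, num_steps=500, num_episodes=10)
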
def compute_reward(sess, step_fn, gamma, num_steps):
  """Computes the discounted reward for a given number of steps.

  Args:
    sess: The tensorflow session.
    step_fn: A function that takes in `sess` and returns a list of
      [state, action, transition_type, reward, meta_reward, discount, ...]
      values.
    gamma: discounting factor to apply to the reward.
    num_steps: number of steps to compute the reward over.
  Returns:
    total_reward: cumulative discounted reward.
    reward: reward received at the final step.
    total_meta_reward: cumulative discounted meta-reward.
    meta_reward: meta-reward received at the final step.
    states: states visited during the rollout.
    actions: actions taken during the rollout.
  """
  total_reward = 0
  total_meta_reward = 0
  gamma_step = 1
  states = []
  actions = []
  for _ in range(num_steps):
    (state, action, transition_type, reward, meta_reward, discount,
     _, _) = step_fn(sess)
    del transition_type  # Unused.
    # Accumulate gamma^t-discounted rewards, additionally scaled by the
    # environment-provided discount.
    total_reward += reward * gamma_step * discount
    total_meta_reward += meta_reward * gamma_step * discount
    gamma_step *= gamma
    states.append(state)
    actions.append(action)
  return (total_reward, reward, total_meta_reward, meta_reward,
          states, actions)

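# A minimal sketch (not part of the library) of the `step_fn` contract that
# `compute_reward` assumes: a hypothetical `make_step_fn` wrapping an agent
# and environment so that each call advances one step and returns the 8-tuple
# unpacked above (the final two values are unused here).
#
#   def make_step_fn(env, agent):
#     def step_fn(sess):
#       state = env.observation()
#       action = agent.act(sess, state)
#       transition_type, reward, meta_reward, discount = env.step(action)
#       return [state, action, transition_type, reward, meta_reward,
#               discount, None, None]
#     return step_fn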