| import tensorflow as tf |
| from baselines.common import tf_util |
| from baselines.a2c.utils import fc |
| from baselines.common.distributions import make_pdtype |
| from baselines.common.input import observation_placeholder, encode_observation |
| from baselines.common.tf_util import adjust_shape |
| from baselines.common.mpi_running_mean_std import RunningMeanStd |
| from baselines.common.models import get_network_builder |
|
|
| import gym |
| import numpy as np |
|
|
| np.set_printoptions(threshold=np.inf) |
|
|
|
|
class PolicyWithValue(object):
    """
    Encapsulates fields and methods for RL policy and value function estimation with shared parameters.

    In addition to the policy and value heads, the object keeps references to the
    skill / vector-quantization tensors produced by the policy network so that they
    can be fetched alongside the action (see skill_step).
    """

    def __init__(self, env, observations, latent, skill_latent, encoding_indices, vq_latent, pure_vq_latent,
                 vq_embeddings, pure_latent,
                 estimate_q=False, vf_latent=None,
                 sess=None, **tensors):
        """
        Parameters:
        ----------
        env                 RL environment; its action_space selects the action distribution type

        observations        tensorflow placeholder in which the observations will be fed

        latent              latent state from which policy distribution parameters should be inferred

        skill_latent        latent state of the skill representation (if None, then latent is used);
                            it is flattened and stored as self.skill_latent

        encoding_indices    tensor stored as self.ei and returned by step / skill_step
                            (presumably the VQ codebook indices -- confirm against the network builder)

        vq_latent           tensor stored unchanged as self.vq_latent and returned by skill_step

        pure_vq_latent      tensor stored unchanged as self.pure_vq_latent and returned by skill_step

        vq_embeddings       tensor stored unchanged as self.vq_embeddings and returned by skill_step

        pure_latent         tensor stored unchanged as self.pure_latent and returned by skill_step

        estimate_q          if True, build a Q-value head (one output per discrete action) instead of a
                            scalar value head

        vf_latent           latent state from which value function should be inferred (if None, then latent is used)

        sess                tensorflow session to run calculations in (if None, default session is used)

        **tensors           tensorflow tensors for additional attributes such as state or mask

        """
        self.X = observations
        self.pure_latent = pure_latent
        self.vq_latent = vq_latent
        self.pure_vq_latent = pure_vq_latent
        self.vq_embeddings = vq_embeddings
        self.ei = encoding_indices
        self.state = tf.constant([])
        self.initial_state = None
        # Extra tensors are applied after the defaults above so they may override them
        # (e.g. a network-provided `state`).
        self.__dict__.update(tensors)

        vf_latent = vf_latent if vf_latent is not None else latent
        # Honour the documented fallback; flattening None would otherwise fail.
        skill_latent = skill_latent if skill_latent is not None else latent

        vf_latent = tf.compat.v1.layers.flatten(vf_latent)
        self.latent = latent = tf.compat.v1.layers.flatten(latent)
        self.skill_latent = skill_latent = tf.compat.v1.layers.flatten(skill_latent)

        self.pdtype = make_pdtype(env.action_space)

        self.pd, self.pi = self.pdtype.pdfromlatent(latent, init_scale=0.01)

        # The sampled action and its negative log-likelihood are graph tensors; they must be
        # fetched in the same session run to be consistent with each other.
        self.action = self.pd.sample()
        self.neglogp = self.pd.neglogp(self.action)
        self.sess = sess or tf.compat.v1.get_default_session()

        if estimate_q:
            assert isinstance(env.action_space, gym.spaces.Discrete)
            self.q = fc(vf_latent, 'q', env.action_space.n)
            self.vf = self.q
        else:
            self.vf = fc(vf_latent, 'vf', 1)
            # Drop the trailing unit dimension of the value head output.
            self.vf = self.vf[:, 0]

    def _evaluate(self, variables, observation, **extra_feed):
        """
        Run `variables` in the session, feeding the observation and any matching extra inputs

        Parameters:
        ----------
        variables       tensor or list of tensors to fetch

        observation     observation data fed into the observation placeholder (reshaped with adjust_shape)

        **extra_feed    additional data keyed by attribute name; an entry is fed only when this object has
                        an attribute of that name which is a placeholder tensor, other entries are ignored

        Returns:
        -------
        the result of sess.run for `variables`
        """
        sess = self.sess
        feed_dict = {self.X: adjust_shape(self.X, observation)}
        for inpt_name, data in extra_feed.items():
            inpt = self.__dict__.get(inpt_name)
            if isinstance(inpt, tf.Tensor) and inpt.op.type == 'Placeholder':
                feed_dict[inpt] = adjust_shape(inpt, data)

        return sess.run(variables, feed_dict)

    def step(self, observation, **extra_feed):
        """
        Compute next action(s) given the observation(s)

        Parameters:
        ----------

        observation     observation data (either single or a batch)

        **extra_feed    additional data such as state or mask (names of the arguments should match the ones in constructor, see __init__)

        Returns:
        -------
        (action, value estimate, encoding indices, next state, negative log likelihood of the action under current policy parameters) tuple;
        next state is None when the fetched state is empty
        """
        # Only the tensors that are actually returned are fetched.
        a, v, encoding_indices, state, neglogp = self._evaluate(
            [self.action, self.vf, self.ei, self.state, self.neglogp],
            observation, **extra_feed)

        if state.size == 0:
            state = None
        return a, v, encoding_indices, state, neglogp

    def skill_step(self, observation, **extra_feed):
        """
        Compute next action(s) together with the skill / VQ tensors given the observation(s)

        Parameters:
        ----------

        observation     observation data (either single or a batch)

        **extra_feed    additional data such as state or mask (names of the arguments should match the ones in constructor, see __init__)

        Returns:
        -------
        (action, value estimate, pure_latent, vq_latent, pure_vq_latent, vq_embeddings, encoding indices,
        skill_latent, latent) tuple
        """
        a, v, pure_latent, vq_latent, pure_vq_latent, vq_embeddings, encoding_indices, sl, lat = self._evaluate(
            [self.action, self.vf, self.pure_latent, self.vq_latent, self.pure_vq_latent, self.vq_embeddings,
             self.ei, self.skill_latent, self.latent],
            observation, **extra_feed)

        return a, v, pure_latent, vq_latent, pure_vq_latent, vq_embeddings, encoding_indices, sl, lat

    def value(self, ob, *args, **kwargs):
        """
        Compute value estimate(s) given the observation(s)

        Parameters:
        ----------

        ob              observation data (either single or a batch)

        **kwargs        additional data such as state or mask (names of the arguments should match the ones in constructor, see __init__)

        Note: extra positional arguments are forwarded unchanged to _evaluate, which only accepts
        keyword extras, so additional data should be passed by keyword.

        Returns:
        -------
        value estimate
        """
        return self._evaluate(self.vf, ob, *args, **kwargs)

    def save(self, save_path):
        """Save the session state to save_path via tf_util.save_state"""
        tf_util.save_state(save_path, sess=self.sess)

    def load(self, load_path):
        """Load the session state from load_path via tf_util.load_state"""
        tf_util.load_state(load_path, sess=self.sess)
|
|
|
|
def build_policy(env, policy_network, value_network=None, normalize_observations=False, estimate_q=False,
                 **policy_kwargs):
    """
    Create a factory function that builds PolicyWithValue objects for `env`

    Parameters:
    ----------
    env                     RL environment; its observation_space is read when the policy is built

    policy_network          either the name of a registered network (resolved with get_network_builder and
                            instantiated with **policy_kwargs) or a callable mapping the encoded observation
                            to the 7-tuple (policy_latent, skill_latent, pure_latent, vq_latent,
                            pure_vq_latent, vq_embeddings, encoding_indices)

    value_network           None or 'shared' to reuse the policy latent for the value head, 'copy' to build a
                            second instance of the policy network under the 'vf' scope, or a callable
                            mapping the encoded observation to the value latent

    normalize_observations  if True (and the observation placeholder is float32), observations are
                            normalized and clipped before being encoded

    estimate_q              forwarded to PolicyWithValue

    **policy_kwargs         keyword arguments for the network builder (used only when policy_network is a name)

    Returns:
    -------
    policy_fn(nbatch=None, nsteps=None, sess=None, observ_placeholder=None) which builds and returns a
    PolicyWithValue; it raises ValueError if the policy network does not return the expected 7-tuple
    """
    if isinstance(policy_network, str):
        network_type = policy_network
        policy_network = get_network_builder(network_type)(**policy_kwargs)

    def policy_fn(nbatch=None, nsteps=None, sess=None, observ_placeholder=None):
        ob_space = env.observation_space

        if observ_placeholder is not None:
            X = observ_placeholder
        else:
            X = observation_placeholder(ob_space, batch_size=nbatch)

        extra_tensors = {}

        if normalize_observations and X.dtype == tf.float32:
            encoded_x, rms = _normalize_clip_observation(X)
            extra_tensors['rms'] = rms
        else:
            encoded_x = X

        encoded_x = encode_observation(ob_space, encoded_x)

        with tf.compat.v1.variable_scope('pi', reuse=tf.compat.v1.AUTO_REUSE):
            network_outputs = policy_network(encoded_x)
            # PolicyWithValue needs every one of these tensors; fail early with a clear message
            # instead of hitting an unbound local name further down.
            if not isinstance(network_outputs, tuple) or len(network_outputs) != 7:
                raise ValueError(
                    'policy_network must return a 7-tuple (policy_latent, skill_latent, pure_latent, '
                    'vq_latent, pure_vq_latent, vq_embeddings, encoding_indices), got {!r}'.format(
                        type(network_outputs)))
            (policy_latent, skill_latent, pure_latent, vq_latent, pure_vq_latent,
             vq_embeddings, encoding_indices) = network_outputs

        _v_net = value_network

        if _v_net is None or _v_net == 'shared':
            vf_latent = policy_latent
        else:
            if _v_net == 'copy':
                _v_net = policy_network
            else:
                assert callable(_v_net)

            with tf.compat.v1.variable_scope('vf', reuse=tf.compat.v1.AUTO_REUSE):
                vf_latent = _v_net(encoded_x)
                # A copied policy network returns the full output tuple; only its first
                # element (the latent) feeds the value head.
                if isinstance(vf_latent, tuple):
                    vf_latent = vf_latent[0]

        policy = PolicyWithValue(
            env=env,
            observations=X,
            latent=policy_latent,
            vf_latent=vf_latent,
            skill_latent=skill_latent,
            pure_latent=pure_latent,
            vq_latent=vq_latent,
            pure_vq_latent=pure_vq_latent,
            vq_embeddings=vq_embeddings,
            encoding_indices=encoding_indices,
            sess=sess,
            estimate_q=estimate_q,
            **extra_tensors
        )
        return policy

    return policy_fn
|
|
|
|
def _normalize_clip_observation(x, clip_range=(-5.0, 5.0)):
    """
    Normalize observations with running statistics and clip the result

    Parameters:
    ----------
    x               observation tensor; the statistics are created with shape x.shape[1:]

    clip_range      pair of bounds for the normalized values; the smaller entry is used as the lower
                    bound and the larger as the upper bound, regardless of order

    Returns:
    -------
    (normalized and clipped tensor, the RunningMeanStd object holding the statistics) tuple;
    the statistics are only created here, not updated
    """
    rms = RunningMeanStd(shape=x.shape[1:])
    lower, upper = min(clip_range), max(clip_range)
    norm_x = tf.clip_by_value((x - rms.mean) / rms.std, lower, upper)
    return norm_x, rms
|
|