leonepson commited on
Commit
5960497
·
verified ·
1 Parent(s): 6876901

Upload 254 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +32 -0
  2. LICENSE +21 -0
  3. README.md +194 -12
  4. baselines/__init__.py +0 -0
  5. baselines/a2c/README.md +13 -0
  6. baselines/a2c/__init__.py +0 -0
  7. baselines/a2c/a2c.py +232 -0
  8. baselines/a2c/runner.py +76 -0
  9. baselines/a2c/utils.py +291 -0
  10. baselines/acer/README.md +6 -0
  11. baselines/acer/__init__.py +0 -0
  12. baselines/acer/acer.py +381 -0
  13. baselines/acer/buffer.py +156 -0
  14. baselines/acer/defaults.py +4 -0
  15. baselines/acer/policies.py +81 -0
  16. baselines/acer/runner.py +61 -0
  17. baselines/acktr/README.md +9 -0
  18. baselines/acktr/__init__.py +0 -0
  19. baselines/acktr/acktr.py +158 -0
  20. baselines/acktr/defaults.py +5 -0
  21. baselines/acktr/kfac.py +928 -0
  22. baselines/acktr/kfac_utils.py +86 -0
  23. baselines/acktr/utils.py +28 -0
  24. baselines/bench/__init__.py +3 -0
  25. baselines/bench/benchmarks.py +164 -0
  26. baselines/bench/monitor.py +162 -0
  27. baselines/bench/test_monitor.py +31 -0
  28. baselines/common/__init__.py +5 -0
  29. baselines/common/atari_wrappers.py +290 -0
  30. baselines/common/cg.py +34 -0
  31. baselines/common/cmd_util.py +206 -0
  32. baselines/common/console_util.py +80 -0
  33. baselines/common/dataset.py +60 -0
  34. baselines/common/distributions.py +355 -0
  35. baselines/common/input.py +65 -0
  36. baselines/common/math_util.py +85 -0
  37. baselines/common/misc_util.py +243 -0
  38. baselines/common/models.py +406 -0
  39. baselines/common/mpi_adam.py +103 -0
  40. baselines/common/mpi_adam_optimizer.py +90 -0
  41. baselines/common/mpi_fork.py +23 -0
  42. baselines/common/mpi_moments.py +61 -0
  43. baselines/common/mpi_running_mean_std.py +112 -0
  44. baselines/common/mpi_util.py +133 -0
  45. baselines/common/plot_util.py +434 -0
  46. baselines/common/policies.py +186 -0
  47. baselines/common/retro_wrappers.py +280 -0
  48. baselines/common/runners.py +19 -0
  49. baselines/common/running_mean_std.py +187 -0
  50. baselines/common/schedules.py +99 -0
.gitattributes CHANGED
@@ -36,3 +36,35 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
36
  checkpoints/sppo-fruitbot_easy_0_0_2021/checkpoints/25000000 filter=lfs diff=lfs merge=lfs -text
37
  checkpoints/sppo-jumper_easy_0_0_2021/checkpoints/25000000 filter=lfs diff=lfs merge=lfs -text
38
  checkpoints/sppo-ninja_easy_0_0_2021/checkpoints/25000000 filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  checkpoints/sppo-fruitbot_easy_0_0_2021/checkpoints/25000000 filter=lfs diff=lfs merge=lfs -text
37
  checkpoints/sppo-jumper_easy_0_0_2021/checkpoints/25000000 filter=lfs diff=lfs merge=lfs -text
38
  checkpoints/sppo-ninja_easy_0_0_2021/checkpoints/25000000 filter=lfs diff=lfs merge=lfs -text
39
+ baselines/gail/result/halfcheetah-training.png filter=lfs diff=lfs merge=lfs -text
40
+ baselines/gail/result/hopper-training.png filter=lfs diff=lfs merge=lfs -text
41
+ baselines/gail/result/humanoid-training.png filter=lfs diff=lfs merge=lfs -text
42
+ baselines/gail/result/humanoidstandup-training.png filter=lfs diff=lfs merge=lfs -text
43
+ baselines/gail/result/walker2d-training.png filter=lfs diff=lfs merge=lfs -text
44
+ train_procgen/checkpoints/sppo-fruitbot_easy_0_0_2021/checkpoints/25000000 filter=lfs diff=lfs merge=lfs -text
45
+ train_procgen/checkpoints/sppo-jumper_easy_0_0_2021/checkpoints/25000000 filter=lfs diff=lfs merge=lfs -text
46
+ train_procgen/checkpoints/sppo-ninja_easy_0_0_2021/checkpoints/25000000 filter=lfs diff=lfs merge=lfs -text
47
+ videos/fruitbot_skills/cluster_0.gif filter=lfs diff=lfs merge=lfs -text
48
+ videos/fruitbot_skills/cluster_1.gif filter=lfs diff=lfs merge=lfs -text
49
+ videos/fruitbot_skills/cluster_2.gif filter=lfs diff=lfs merge=lfs -text
50
+ videos/fruitbot_skills/cluster_3.gif filter=lfs diff=lfs merge=lfs -text
51
+ videos/fruitbot_skills/cluster_4.gif filter=lfs diff=lfs merge=lfs -text
52
+ videos/fruitbot_skills/cluster_5.gif filter=lfs diff=lfs merge=lfs -text
53
+ videos/fruitbot_skills/cluster_6.gif filter=lfs diff=lfs merge=lfs -text
54
+ videos/fruitbot_skills/cluster_7.gif filter=lfs diff=lfs merge=lfs -text
55
+ videos/jumper_skills/cluster_0.gif filter=lfs diff=lfs merge=lfs -text
56
+ videos/jumper_skills/cluster_1.gif filter=lfs diff=lfs merge=lfs -text
57
+ videos/jumper_skills/cluster_2.gif filter=lfs diff=lfs merge=lfs -text
58
+ videos/jumper_skills/cluster_3.gif filter=lfs diff=lfs merge=lfs -text
59
+ videos/jumper_skills/cluster_4.gif filter=lfs diff=lfs merge=lfs -text
60
+ videos/jumper_skills/cluster_5.gif filter=lfs diff=lfs merge=lfs -text
61
+ videos/jumper_skills/cluster_6.gif filter=lfs diff=lfs merge=lfs -text
62
+ videos/jumper_skills/cluster_7.gif filter=lfs diff=lfs merge=lfs -text
63
+ videos/ninja_skills/cluster_0.gif filter=lfs diff=lfs merge=lfs -text
64
+ videos/ninja_skills/cluster_1.gif filter=lfs diff=lfs merge=lfs -text
65
+ videos/ninja_skills/cluster_2.gif filter=lfs diff=lfs merge=lfs -text
66
+ videos/ninja_skills/cluster_3.gif filter=lfs diff=lfs merge=lfs -text
67
+ videos/ninja_skills/cluster_4.gif filter=lfs diff=lfs merge=lfs -text
68
+ videos/ninja_skills/cluster_5.gif filter=lfs diff=lfs merge=lfs -text
69
+ videos/ninja_skills/cluster_6.gif filter=lfs diff=lfs merge=lfs -text
70
+ videos/ninja_skills/cluster_7.gif filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ The MIT License
2
+
3
+ Copyright (c) 2019 OpenAI (http://openai.com)
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
README.md CHANGED
@@ -1,12 +1,194 @@
1
- ---
2
- license: mit
3
- language:
4
- - en
5
- pipeline_tag: reinforcement-learning
6
- tags:
7
- - clustering
8
- - interpretablity
9
- - visualization
10
- - https://arxiv.org/abs/2409.17411
11
- paper: https://arxiv.org/abs/2409.17411
12
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Enhancing Interpretability in Deep Reinforcement Learning through Semantic Clustering
2
+
3
+ **Authors:** Liang Zhang, Justin Lieffers, Adarsh Pyarelal
4
+ **Conference:** NeurIPS 2025 Main Track
5
+ **Paper:** [arXiv:2409.17411](https://arxiv.org/abs/2409.17411)
6
+
7
+ This repository contains the official implementation of our research on enhancing interpretability in deep reinforcement learning through semantic clustering techniques. Our work extends the [OpenAI train-procgen](https://github.com/openai/train-procgen) framework to incorporate semantic clustering methods for improved understanding and visualization of learned policies in procedural environments.
8
+
9
+ ## 📋 Abstract
10
+
11
+ This work presents a novel approach to enhancing interpretability in deep reinforcement learning by leveraging semantic clustering techniques. We demonstrate how semantic clustering can provide insights into learned policies, enabling better understanding of agent behavior and decision-making processes in complex procedural environments.
12
+
13
+ ## 🚀 Quick Start
14
+
15
+ ### Installation
16
+
17
+ Prerequisite: Python 3.8.
18
+
19
+ 1. **Clone the repository:**
20
+ ```bash
21
+ git clone https://github.com/ualiangzhang/semantic_rl.git
22
+ cd semantic_rl
23
+ ```
24
+
25
+ 2. **Install dependencies (Python 3.8):**
26
+ ```bash
27
+ pip install -r requirements.txt
28
+ ```
29
+
30
+ 3. **Install Procgen environments:**
31
+ Follow the installation steps in the [Procgen repository](https://github.com/openai/procgen).
32
+
33
+ ### Basic Usage
34
+
35
+ **Train a semantic clustering model:**
36
+ ```bash
37
+ python -m train_procgen.train_sppo --env_name <ENV_NAME> --num_levels 0 --distribution_mode easy --timesteps_per_proc 25000000 --rand_seed <RAND_SEED>
38
+ ```
39
+
40
+ **Train a baseline model:**
41
+ ```bash
42
+ python -m train_procgen.train_ppo --env_name <ENV_NAME> --num_levels 0 --distribution_mode easy --timesteps_per_proc 25000000 --rand_seed <RAND_SEED>
43
+ ```
44
+
45
+ ## 📊 Visualization and Analysis
46
+
47
+ ### Performance Analysis
48
+
49
+ **Generate generalization figures for a single game:**
50
+ ```bash
51
+ cd train_procgen
52
+ python single_graph.py --env_name <ENV_NAME>
53
+ # Example:
54
+ python single_graph.py --env_name coinrun
55
+ ```
56
+
57
+ ### Semantic Clustering Visualization
58
+
59
+ **Generate embedding space visualizations:**
60
+ ```bash
61
+ python -m train_procgen.enjoy_sppo --env_name <ENV_NAME> --mode 1
62
+ ```
63
+
64
+ **Generate skill demonstration videos:**
65
+ ```bash
66
+ python -m train_procgen.enjoy_sppo --env_name <ENV_NAME> --mode 0
67
+ ```
68
+
69
+ **Interactive cluster exploration:**
70
+ ```bash
71
+ python -m train_procgen.hover_clusters --env_name <ENV_NAME>
72
+ # Example:
73
+ python -m train_procgen.hover_clusters --env_name fruitbot
74
+ ```
75
+
76
+ ## 🎮 Supported Environments
77
+
78
+ Our implementation supports four Procgen environments:
79
+ - **CoinRun**
80
+ - **FruitBot**
81
+ - **Jumper**
82
+ - **Ninja**
83
+
84
+ ## 🎬 Semantic Clustering Demonstration
85
+
86
+ ### Ninja Environment - 8 Semantic Clusters
87
+
88
+ The following videos demonstrate the 8 distinct semantic clusters learned by our model in the Ninja environment. Each cluster represents a different behavioral pattern and skill set:
89
+
90
+ #### 📹 Semantic Cluster Demonstrations
91
+
92
+ <table>
93
+ <tr>
94
+ <td align="center">
95
+ <strong>Cluster 0</strong><br/>
96
+ <img src="videos/ninja_skills/cluster_0.gif" width="180" height="180" />
97
+ </td>
98
+ <td align="center">
99
+ <strong>Cluster 1</strong><br/>
100
+ <img src="videos/ninja_skills/cluster_1.gif" width="180" height="180" />
101
+ </td>
102
+ <td align="center">
103
+ <strong>Cluster 2</strong><br/>
104
+ <img src="videos/ninja_skills/cluster_2.gif" width="180" height="180" />
105
+ </td>
106
+ <td align="center">
107
+ <strong>Cluster 3</strong><br/>
108
+ <img src="videos/ninja_skills/cluster_3.gif" width="180" height="180" />
109
+ </td>
110
+ </tr>
111
+ <tr>
112
+ <td align="center">
113
+ <strong>Cluster 4</strong><br/>
114
+ <img src="videos/ninja_skills/cluster_4.gif" width="180" height="180" />
115
+ </td>
116
+ <td align="center">
117
+ <strong>Cluster 5</strong><br/>
118
+ <img src="videos/ninja_skills/cluster_5.gif" width="180" height="180" />
119
+ </td>
120
+ <td align="center">
121
+ <strong>Cluster 6</strong><br/>
122
+ <img src="videos/ninja_skills/cluster_6.gif" width="180" height="180" />
123
+ </td>
124
+ <td align="center">
125
+ <strong>Cluster 7</strong><br/>
126
+ <img src="videos/ninja_skills/cluster_7.gif" width="180" height="180" />
127
+ </td>
128
+ </tr>
129
+ </table>
130
+
131
+ #### 🧭 Behavior Descriptions (Ninja)
132
+
133
+ | Cluster | Behavior |
134
+ |---------|----------|
135
+ | 0 | The agent starts by walking through the first platform and then performs a high jump to reach a higher ledge. |
136
+ | 1 | The agent makes small jumps in the middle of the scene. |
137
+ | 2 | Two interpretations are present: (1) the agent starts from the leftmost end of the scene and walks to the starting position of Cluster 0; (2) when there are no higher ledges to jump to, the agent begins from the scene, walks over the first platform, and prepares to jump to the subsequent ledge. |
138
+ | 3 | The agent walks on the ledge and prepares to jump to a higher ledge. |
139
+ | 4 | After performing a high jump, the agent loses sight of the ledge below. |
140
+ | 5 | The agent walks on the ledge and prepares to jump onto a ledge at the same height or lower. |
141
+ | 6 | The agent executes a high jump while keeping the ledge below in sight. |
142
+ | 7 | The agent moves towards the right edge of the scene and touches the mushroom. |
143
+
144
+ #### 📊 Alternative: Generate Your Own Videos
145
+
146
+ You can also generate these videos yourself using our code:
147
+
148
+ ```bash
149
+ # Generate Ninja skill cluster videos
150
+ python -m train_procgen.enjoy_sppo --env_name ninja --mode 0 --num_embeddings 8
151
+ ```
152
+
153
+ **Note:** These videos showcase the distinct behavioral patterns learned by our semantic clustering approach. Each cluster demonstrates different combat strategies, movement patterns, and decision-making processes in the Ninja environment.
154
+
155
+ ## 📁 Output Structure
156
+
157
+ ```
158
+ baseline/ # Required RL training package
159
+ train_procgen/
160
+ ├── checkpoints/ # Trained model checkpoints
161
+ ├── figures/ # Generated visualizations and videos
162
+ videos/ # video clips corresponding to the clusters in the paper
163
+ ```
164
+
165
+ ## 📈 Reproducing Results
166
+
167
+ To reproduce the results from our paper:
168
+
169
+ 1. **(Optional) Use existing checkpoints**: We have provided pre-trained checkpoints for Ninja, FruitBot, and Jumper (random seed 2021) in this repository under `train_procgen/checkpoints/`. You can skip training and directly run the visualization scripts. Otherwise, **train models** using the commands above.
170
+ 2. **Generate visualizations** using the provided scripts
171
+ 3. **Analyze results** using the interactive tools
172
+
173
+ **Note:** Video generation may take 30-60 minutes depending on machine performance, as it ensures comprehensive exploration of all clusters.
174
+
175
+ <!-- ## 🤝 Citation
176
+
177
+ If you use this code in your research, please cite our paper: -->
178
+
179
+ <!-- ```bibtex
180
+ @article{zhang2025enhancing,
181
+ title={Enhancing Interpretability in Deep Reinforcement Learning through Semantic Clustering},
182
+ author={Zhang, Liang and Lieffers, Justin and Pyarelal, Adarsh},
183
+ journal={Advances in Neural Information Processing Systems},
184
+ year={2025}
185
+ }
186
+ ``` -->
187
+
188
+ ## 📄 License
189
+
190
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
191
+
192
+ ## 🙏 Acknowledgments
193
+
194
+ This work builds upon the [OpenAI train-procgen](https://github.com/openai/train-procgen) framework. We thank the original authors for their excellent work on procedural generation for reinforcement learning benchmarking.
baselines/__init__.py ADDED
File without changes
baselines/a2c/README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # A2C
2
+
3
+ - Original paper: https://arxiv.org/abs/1602.01783
4
+ - Baselines blog post: https://blog.openai.com/baselines-acktr-a2c/
5
+ - `python -m baselines.run --alg=a2c --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on an Atari Pong. See help (`-h`) for more options
6
+ - also refer to the repo-wide [README.md](../../README.md#training-models)
7
+
8
+ ## Files
9
+ - `run_atari`: file used to run the algorithm.
10
+ - `policies.py`: contains the different versions of the A2C architecture (MlpPolicy, CNNPolicy, LstmPolicy...).
11
+ - `a2c.py`: - Model : class used to initialize the step_model (sampling) and train_model (training)
12
+ - learn : Main entrypoint for A2C algorithm. Train a policy with given network architecture on a given environment using a2c algorithm.
13
+ `runner.py`: class used to generate a batch of experiences
baselines/a2c/__init__.py ADDED
File without changes
baselines/a2c/a2c.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import functools
3
+ import tensorflow as tf
4
+
5
+ from baselines import logger
6
+
7
+ from baselines.common import set_global_seeds, explained_variance
8
+ from baselines.common import tf_util
9
+ from baselines.common.policies import build_policy
10
+
11
+
12
+ from baselines.a2c.utils import Scheduler, find_trainable_variables
13
+ from baselines.a2c.runner import Runner
14
+ from baselines.ppo2.ppo2 import safemean
15
+ from collections import deque
16
+
17
+ from tensorflow import losses
18
+
19
+ class Model(object):
20
+
21
+ """
22
+ We use this class to :
23
+ __init__:
24
+ - Creates the step_model
25
+ - Creates the train_model
26
+
27
+ train():
28
+ - Make the training part (feedforward and backpropagation of gradients)
29
+
30
+ save/load():
31
+ - Save load the model
32
+ """
33
+ def __init__(self, policy, env, nsteps,
34
+ ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4,
35
+ alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'):
36
+
37
+ sess = tf_util.get_session()
38
+ nenvs = env.num_envs
39
+ nbatch = nenvs*nsteps
40
+
41
+
42
+ with tf.compat.v1.variable_scope('a2c_model', reuse=tf.compat.v1.AUTO_REUSE):
43
+ # step_model is used for sampling
44
+ step_model = policy(nenvs, 1, sess)
45
+
46
+ # train_model is used to train our network
47
+ train_model = policy(nbatch, nsteps, sess)
48
+
49
+ A = tf.compat.v1.placeholder(train_model.action.dtype, train_model.action.shape)
50
+ ADV = tf.compat.v1.placeholder(tf.float32, [nbatch])
51
+ R = tf.compat.v1.placeholder(tf.float32, [nbatch])
52
+ LR = tf.compat.v1.placeholder(tf.float32, [])
53
+
54
+ # Calculate the loss
55
+ # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss
56
+
57
+ # Policy loss
58
+ neglogpac = train_model.pd.neglogp(A)
59
+ # L = A(s,a) * -logpi(a|s)
60
+ pg_loss = tf.reduce_mean(input_tensor=ADV * neglogpac)
61
+
62
+ # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy.
63
+ entropy = tf.reduce_mean(input_tensor=train_model.pd.entropy())
64
+
65
+ # Value loss
66
+ vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R)
67
+
68
+ loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef
69
+
70
+ # Update parameters using loss
71
+ # 1. Get the model parameters
72
+ params = find_trainable_variables("a2c_model")
73
+
74
+ # 2. Calculate the gradients
75
+ grads = tf.gradients(ys=loss, xs=params)
76
+ if max_grad_norm is not None:
77
+ # Clip the gradients (normalize)
78
+ grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
79
+ grads = list(zip(grads, params))
80
+ # zip aggregate each gradient with parameters associated
81
+ # For instance zip(ABCD, xyza) => Ax, By, Cz, Da
82
+
83
+ # 3. Make op for one policy and value update step of A2C
84
+ trainer = tf.compat.v1.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
85
+
86
+ _train = trainer.apply_gradients(grads)
87
+
88
+ lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)
89
+
90
+ def train(obs, states, rewards, masks, actions, values):
91
+ # Here we calculate advantage A(s,a) = R + yV(s') - V(s)
92
+ # rewards = R + yV(s')
93
+ advs = rewards - values
94
+ for step in range(len(obs)):
95
+ cur_lr = lr.value()
96
+
97
+ td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr}
98
+ if states is not None:
99
+ td_map[train_model.S] = states
100
+ td_map[train_model.M] = masks
101
+ policy_loss, value_loss, policy_entropy, _ = sess.run(
102
+ [pg_loss, vf_loss, entropy, _train],
103
+ td_map
104
+ )
105
+ return policy_loss, value_loss, policy_entropy
106
+
107
+
108
+ self.train = train
109
+ self.train_model = train_model
110
+ self.step_model = step_model
111
+ self.step = step_model.step
112
+ self.value = step_model.value
113
+ self.initial_state = step_model.initial_state
114
+ self.save = functools.partial(tf_util.save_variables, sess=sess)
115
+ self.load = functools.partial(tf_util.load_variables, sess=sess)
116
+ tf.compat.v1.global_variables_initializer().run(session=sess)
117
+
118
+
119
+ def learn(
120
+ network,
121
+ env,
122
+ seed=None,
123
+ nsteps=5,
124
+ total_timesteps=int(80e6),
125
+ vf_coef=0.5,
126
+ ent_coef=0.01,
127
+ max_grad_norm=0.5,
128
+ lr=7e-4,
129
+ lrschedule='linear',
130
+ epsilon=1e-5,
131
+ alpha=0.99,
132
+ gamma=0.99,
133
+ log_interval=100,
134
+ load_path=None,
135
+ **network_kwargs):
136
+
137
+ '''
138
+ Main entrypoint for A2C algorithm. Train a policy with given network architecture on a given environment using a2c algorithm.
139
+
140
+ Parameters:
141
+ -----------
142
+
143
+ network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
144
+ specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
145
+ tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
146
+ neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
147
+ See baselines.common/policies.py/lstm for more details on using recurrent nets in policies
148
+
149
+
150
+ env: RL environment. Should implement interface similar to VecEnv (baselines.common/vec_env) or be wrapped with DummyVecEnv (baselines.common/vec_env/dummy_vec_env.py)
151
+
152
+
153
+ seed: seed to make random number sequence in the algorithm reproducible. By default is None which means seed from system noise generator (not reproducible)
154
+
155
+ nsteps: int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
156
+ nenv is number of environment copies simulated in parallel)
157
+
158
+ total_timesteps: int, total number of timesteps to train on (default: 80M)
159
+
160
+ vf_coef: float, coefficient in front of value function loss in the total loss function (default: 0.5)
161
+
162
+ ent_coef: float, coefficient in front of the policy entropy in the total loss function (default: 0.01)
163
+
164
+ max_grad_norm: float, gradient is clipped to have global L2 norm no more than this value (default: 0.5)
165
+
166
+ lr: float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4)
167
+
168
+ lrschedule: schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and
169
+ returns fraction of the learning rate (specified as lr) as output
170
+
171
+ epsilon: float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5)
172
+
173
+ alpha: float, RMSProp decay parameter (default: 0.99)
174
+
175
+ gamma: float, reward discounting parameter (default: 0.99)
176
+
177
+ log_interval: int, specifies how frequently the logs are printed out (default: 100)
178
+
179
+ **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
180
+ For instance, 'mlp' network architecture has arguments num_hidden and num_layers.
181
+
182
+ '''
183
+
184
+
185
+
186
+ set_global_seeds(seed)
187
+
188
+ # Get the nb of env
189
+ nenvs = env.num_envs
190
+ policy = build_policy(env, network, **network_kwargs)
191
+
192
+ # Instantiate the model object (that creates step_model and train_model)
193
+ model = Model(policy=policy, env=env, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
194
+ max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule)
195
+ if load_path is not None:
196
+ model.load(load_path)
197
+
198
+ # Instantiate the runner object
199
+ runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
200
+ epinfobuf = deque(maxlen=100)
201
+
202
+ # Calculate the batch_size
203
+ nbatch = nenvs*nsteps
204
+
205
+ # Start total timer
206
+ tstart = time.time()
207
+
208
+ for update in range(1, total_timesteps//nbatch+1):
209
+ # Get mini batch of experiences
210
+ obs, states, rewards, masks, actions, values, epinfos = runner.run()
211
+ epinfobuf.extend(epinfos)
212
+
213
+ policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
214
+ nseconds = time.time()-tstart
215
+
216
+ # Calculate the fps (frame per second)
217
+ fps = int((update*nbatch)/nseconds)
218
+ if update % log_interval == 0 or update == 1:
219
+ # Calculates if value function is a good predicator of the returns (ev > 1)
220
+ # or if it's just worse than predicting nothing (ev =< 0)
221
+ ev = explained_variance(values, rewards)
222
+ logger.record_tabular("nupdates", update)
223
+ logger.record_tabular("total_timesteps", update*nbatch)
224
+ logger.record_tabular("fps", fps)
225
+ logger.record_tabular("policy_entropy", float(policy_entropy))
226
+ logger.record_tabular("value_loss", float(value_loss))
227
+ logger.record_tabular("explained_variance", float(ev))
228
+ logger.record_tabular("eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]))
229
+ logger.record_tabular("eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
230
+ logger.dump_tabular()
231
+ return model
232
+
baselines/a2c/runner.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from baselines.a2c.utils import discount_with_dones
3
+ from baselines.common.runners import AbstractEnvRunner
4
+
5
+ class Runner(AbstractEnvRunner):
6
+ """
7
+ We use this class to generate batches of experiences
8
+
9
+ __init__:
10
+ - Initialize the runner
11
+
12
+ run():
13
+ - Make a mini batch of experiences
14
+ """
15
+ def __init__(self, env, model, nsteps=5, gamma=0.99):
16
+ super().__init__(env=env, model=model, nsteps=nsteps)
17
+ self.gamma = gamma
18
+ self.batch_action_shape = [x if x is not None else -1 for x in model.train_model.action.shape.as_list()]
19
+ self.ob_dtype = model.train_model.X.dtype.as_numpy_dtype
20
+
21
+ def run(self):
22
+ # We initialize the lists that will contain the mb of experiences
23
+ mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
24
+ mb_states = self.states
25
+ epinfos = []
26
+ for n in range(self.nsteps):
27
+ # Given observations, take action and value (V(s))
28
+ # We already have self.obs because Runner superclass run self.obs[:] = env.reset() on init
29
+ actions, values, states, _ = self.model.step(self.obs, S=self.states, M=self.dones)
30
+
31
+ # Append the experiences
32
+ mb_obs.append(np.copy(self.obs))
33
+ mb_actions.append(actions)
34
+ mb_values.append(values)
35
+ mb_dones.append(self.dones)
36
+
37
+ # Take actions in env and look the results
38
+ obs, rewards, dones, infos = self.env.step(actions)
39
+ for info in infos:
40
+ maybeepinfo = info.get('episode')
41
+ if maybeepinfo: epinfos.append(maybeepinfo)
42
+ self.states = states
43
+ self.dones = dones
44
+ self.obs = obs
45
+ mb_rewards.append(rewards)
46
+ mb_dones.append(self.dones)
47
+
48
+ # Batch of steps to batch of rollouts
49
+ mb_obs = np.asarray(mb_obs, dtype=self.ob_dtype).swapaxes(1, 0).reshape(self.batch_ob_shape)
50
+ mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
51
+ mb_actions = np.asarray(mb_actions, dtype=self.model.train_model.action.dtype.name).swapaxes(1, 0)
52
+ mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
53
+ mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
54
+ mb_masks = mb_dones[:, :-1]
55
+ mb_dones = mb_dones[:, 1:]
56
+
57
+
58
+ if self.gamma > 0.0:
59
+ # Discount/bootstrap off value fn
60
+ last_values = self.model.value(self.obs, S=self.states, M=self.dones).tolist()
61
+ for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
62
+ rewards = rewards.tolist()
63
+ dones = dones.tolist()
64
+ if dones[-1] == 0:
65
+ rewards = discount_with_dones(rewards+[value], dones+[0], self.gamma)[:-1]
66
+ else:
67
+ rewards = discount_with_dones(rewards, dones, self.gamma)
68
+
69
+ mb_rewards[n] = rewards
70
+
71
+ mb_actions = mb_actions.reshape(self.batch_action_shape)
72
+
73
+ mb_rewards = mb_rewards.flatten()
74
+ mb_values = mb_values.flatten()
75
+ mb_masks = mb_masks.flatten()
76
+ return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values, epinfos
baselines/a2c/utils.py ADDED
@@ -0,0 +1,291 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ import tensorflow as tf
4
+ from collections import deque
5
+
6
def sample(logits):
    """Draw one action index per row from the categorical distribution
    given by unnormalized `logits`, via the Gumbel-max trick."""
    u = tf.random.uniform(tf.shape(input=logits))
    gumbel = -tf.math.log(-tf.math.log(u))
    return tf.argmax(input=logits + gumbel, axis=1)
9
+
10
def cat_entropy(logits):
    # Entropy of a categorical distribution parameterized by unnormalized
    # logits, computed stably: shift by the per-row max so exp() cannot
    # overflow (standard log-sum-exp trick).
    a0 = logits - tf.reduce_max(input_tensor=logits, axis=1, keepdims=True)
    ea0 = tf.exp(a0)
    z0 = tf.reduce_sum(input_tensor=ea0, axis=1, keepdims=True)
    p0 = ea0 / z0
    # sum_i p_i * (log Z - a_i) == -sum_i p_i * log p_i
    return tf.reduce_sum(input_tensor=p0 * (tf.math.log(z0) - a0), axis=1)
16
+
17
def cat_entropy_softmax(p0):
    """Entropy of already-normalized probabilities `p0`; the small epsilon
    keeps log() finite when a probability is exactly zero."""
    log_p = tf.math.log(p0 + 1e-6)
    return -tf.reduce_sum(input_tensor=p0 * log_p, axis=1)
19
+
20
def ortho_init(scale=1.0):
    """Return an initializer producing (scaled) orthogonal weight matrices
    (Lasagne-style orthogonal init).

    Supports 2-D (fully-connected) and 4-D (conv kernel, assumed NHWC)
    shapes; anything else raises NotImplementedError.
    """
    def _ortho_init(shape, dtype, partition_info=None):
        shape = tuple(shape)
        if len(shape) == 2:
            flat_shape = shape
        elif len(shape) == 4:  # conv kernel, assumed NHWC
            flat_shape = (np.prod(shape[:-1]), shape[-1])
        else:
            raise NotImplementedError
        gaussian = np.random.normal(0.0, 1.0, flat_shape)
        u, _, v = np.linalg.svd(gaussian, full_matrices=False)
        # u and v are both orthonormal; keep whichever matches flat_shape
        q = u if u.shape == flat_shape else v
        q = q.reshape(shape)
        return (scale * q[:shape[0], :shape[1]]).astype(np.float32)
    return _ortho_init
36
+
37
def conv(x, scope, *, nf, rf, stride, pad='VALID', init_scale=1.0, data_format='NHWC', one_dim_bias=False):
    """2-D convolution layer with orthogonal weight initialization.

    x: input tensor; scope: variable-scope name; nf: number of filters;
    rf: square receptive-field size; stride: spatial stride.
    Returns conv2d(x, w) + b.
    """
    if data_format == 'NHWC':
        channel_ax = 3
        strides = [1, stride, stride, 1]
        bshape = [1, 1, 1, nf]
    elif data_format == 'NCHW':
        channel_ax = 1
        strides = [1, 1, stride, stride]
        bshape = [1, nf, 1, 1]
    else:
        raise NotImplementedError
    bias_var_shape = [nf] if one_dim_bias else [1, nf, 1, 1]
    # TF1 Dimension objects expose .value; TF2 shapes hold plain ints.
    # Catch only AttributeError — the original bare `except:` also swallowed
    # KeyboardInterrupt/SystemExit and hid real errors.
    try:
        nin = x.get_shape()[channel_ax].value
    except AttributeError:
        nin = x.get_shape()[channel_ax]
    wshape = [rf, rf, nin, nf]
    with tf.compat.v1.variable_scope(scope):
        w = tf.compat.v1.get_variable("w", wshape, initializer=ortho_init(init_scale))
        b = tf.compat.v1.get_variable("b", bias_var_shape, initializer=tf.compat.v1.constant_initializer(0.0))
        if not one_dim_bias and data_format == 'NHWC':
            b = tf.reshape(b, bshape)
        return tf.nn.conv2d(input=x, filters=w, strides=strides, padding=pad, data_format=data_format) + b
60
+
61
def fc(x, scope, nh, *, init_scale=1.0, init_bias=0.0):
    """Fully-connected layer: returns x @ w + b with orthogonal weight init.

    x: [batch, nin] input; scope: variable-scope name; nh: output width.
    """
    with tf.compat.v1.variable_scope(scope):
        # TF1 Dimension objects expose .value; TF2 shapes hold plain ints.
        # Catch only AttributeError — a bare `except:` would also swallow
        # KeyboardInterrupt/SystemExit.
        try:
            nin = x.get_shape()[1].value
        except AttributeError:
            nin = x.get_shape()[1]
        w = tf.compat.v1.get_variable("w", [nin, nh], initializer=ortho_init(init_scale))
        b = tf.compat.v1.get_variable("b", [nh], initializer=tf.compat.v1.constant_initializer(init_bias))
        return tf.matmul(x, w)+b
70
+
71
def batch_to_seq(h, nbatch, nsteps, flat=False):
    # Split a flat batch tensor of shape [nbatch*nsteps, ...] into a python
    # list of `nsteps` tensors of shape [nbatch, ...] (one per time step),
    # for feeding recurrent cells step by step.  `flat=True` treats h as a
    # scalar-per-timestep signal (e.g. rewards or masks).
    if flat:
        h = tf.reshape(h, [nbatch, nsteps])
    else:
        h = tf.reshape(h, [nbatch, nsteps, -1])
    return [tf.squeeze(v, [1]) for v in tf.split(axis=1, num_or_size_splits=nsteps, value=h)]
77
+
78
def seq_to_batch(h, flat = False):
    """Inverse of batch_to_seq: concatenate a list of per-step tensors back
    into a single [nbatch*nsteps, ...] batch tensor."""
    shape = h[0].get_shape().as_list()
    if not flat:
        assert(len(shape) > 1)
        # TF1 Dimensions have .value; TF2 shapes hold plain ints.  Guard it
        # the same way conv/fc/conv_to_fc do — the unguarded `.value` here
        # raised AttributeError under TF2 while its siblings did not.
        try:
            nh = h[0].get_shape()[-1].value
        except AttributeError:
            nh = h[0].get_shape()[-1]
        return tf.reshape(tf.concat(axis=1, values=h), [-1, nh])
    else:
        return tf.reshape(tf.stack(values=h, axis=1), [-1])
86
+
87
def lstm(xs, ms, s, scope, nh, init_scale=1.0):
    """Standard LSTM unrolled over a python list of per-step inputs.

    xs: list (length nsteps) of [nbatch, nin] tensors; entries are
        overwritten in place with the per-step hidden states.
    ms: list of per-step reset masks; multiplying by (1-m) zeroes the
        carried c/h state where a new episode starts.
    s:  packed [nbatch, 2*nh] state — first half cell state c, second half h.
    Returns (xs, s): the hidden-state list and the final packed state.
    """
    nbatch, nin = [v.value for v in xs[0].get_shape()]  # NOTE: TF1-style Dimension.value
    with tf.compat.v1.variable_scope(scope):
        wx = tf.compat.v1.get_variable("wx", [nin, nh*4], initializer=ortho_init(init_scale))
        wh = tf.compat.v1.get_variable("wh", [nh, nh*4], initializer=ortho_init(init_scale))
        b = tf.compat.v1.get_variable("b", [nh*4], initializer=tf.compat.v1.constant_initializer(0.0))

    c, h = tf.split(axis=1, num_or_size_splits=2, value=s)
    for idx, (x, m) in enumerate(zip(xs, ms)):
        c = c*(1-m)  # reset state at episode boundaries
        h = h*(1-m)
        z = tf.matmul(x, wx) + tf.matmul(h, wh) + b
        i, f, o, u = tf.split(axis=1, num_or_size_splits=4, value=z)
        i = tf.nn.sigmoid(i)  # input gate
        f = tf.nn.sigmoid(f)  # forget gate
        o = tf.nn.sigmoid(o)  # output gate
        u = tf.tanh(u)        # candidate cell update
        c = f*c + i*u
        h = o*tf.tanh(c)
        xs[idx] = h
    s = tf.concat(axis=1, values=[c, h])
    return xs, s
109
+
110
def _ln(x, g, b, e=1e-5, axes=None):
    """Layer-normalize x over `axes` (default: axis 1), then apply gain g
    and bias b.

    The default for `axes` was a mutable list literal; use the None-sentinel
    idiom instead to avoid the shared-mutable-default pitfall (behavior is
    unchanged for all callers).
    """
    if axes is None:
        axes = [1]
    u, s = tf.nn.moments(x=x, axes=axes, keepdims=True)
    x = (x-u)/tf.sqrt(s+e)
    x = x*g+b
    return x
115
+
116
def lnlstm(xs, ms, s, scope, nh, init_scale=1.0):
    """LSTM with layer normalization applied to the input projection, the
    recurrent projection, and the cell state before the output tanh.

    Same contract as `lstm`: xs is mutated in place with per-step hidden
    states and the final packed [nbatch, 2*nh] state is returned.
    """
    nbatch, nin = [v.value for v in xs[0].get_shape()]  # NOTE: TF1-style Dimension.value
    with tf.compat.v1.variable_scope(scope):
        wx = tf.compat.v1.get_variable("wx", [nin, nh*4], initializer=ortho_init(init_scale))
        gx = tf.compat.v1.get_variable("gx", [nh*4], initializer=tf.compat.v1.constant_initializer(1.0))
        bx = tf.compat.v1.get_variable("bx", [nh*4], initializer=tf.compat.v1.constant_initializer(0.0))

        wh = tf.compat.v1.get_variable("wh", [nh, nh*4], initializer=ortho_init(init_scale))
        gh = tf.compat.v1.get_variable("gh", [nh*4], initializer=tf.compat.v1.constant_initializer(1.0))
        bh = tf.compat.v1.get_variable("bh", [nh*4], initializer=tf.compat.v1.constant_initializer(0.0))

        b = tf.compat.v1.get_variable("b", [nh*4], initializer=tf.compat.v1.constant_initializer(0.0))

        # gain/bias for the layer-norm of the cell state
        gc = tf.compat.v1.get_variable("gc", [nh], initializer=tf.compat.v1.constant_initializer(1.0))
        bc = tf.compat.v1.get_variable("bc", [nh], initializer=tf.compat.v1.constant_initializer(0.0))

    c, h = tf.split(axis=1, num_or_size_splits=2, value=s)
    for idx, (x, m) in enumerate(zip(xs, ms)):
        c = c*(1-m)  # reset state at episode boundaries
        h = h*(1-m)
        z = _ln(tf.matmul(x, wx), gx, bx) + _ln(tf.matmul(h, wh), gh, bh) + b
        i, f, o, u = tf.split(axis=1, num_or_size_splits=4, value=z)
        i = tf.nn.sigmoid(i)  # input gate
        f = tf.nn.sigmoid(f)  # forget gate
        o = tf.nn.sigmoid(o)  # output gate
        u = tf.tanh(u)        # candidate cell update
        c = f*c + i*u
        h = o*tf.tanh(_ln(c, gc, bc))
        xs[idx] = h
    s = tf.concat(axis=1, values=[c, h])
    return xs, s
147
+
148
def conv_to_fc(x):
    """Flatten a conv feature map [batch, h, w, c] to [batch, h*w*c]."""
    # TF1 Dimensions have .value; TF2 shapes hold plain ints.  Catch only
    # AttributeError — the original bare `except:` swallowed everything.
    try:
        nh = np.prod([v.value for v in x.get_shape()[1:]])
    except AttributeError:
        nh = np.prod([v for v in x.get_shape()[1:]])
    x = tf.reshape(x, [-1, nh])
    return x
155
+
156
def discount_with_dones(rewards, dones, gamma):
    """Compute discounted returns over one trajectory, resetting the
    running accumulator wherever done == 1 (episode boundary)."""
    out = []
    running = 0
    for reward, done in zip(reversed(rewards), reversed(dones)):
        # (1 - done) zeroes the carried return at episode boundaries
        running = reward + gamma*running*(1.-done)
        out.append(running)
    out.reverse()
    return out
163
+
164
def find_trainable_variables(key):
    # All trainable variables whose name falls under the scope `key`.
    return tf.compat.v1.trainable_variables(key)
166
+
167
def make_path(f):
    """Create directory `f` (and any missing parents); no-op if it already
    exists.  Returns None, like os.makedirs."""
    os.makedirs(f, exist_ok=True)
169
+
170
def constant(p):
    """Schedule: always the full value, regardless of progress."""
    return 1

def linear(p):
    """Schedule: linear decay from 1 to 0 as progress p goes 0 -> 1."""
    return 1-p

def middle_drop(p):
    """Linear decay that drops to a fixed 0.075 floor once the remaining
    fraction (1-p) falls below 0.75."""
    eps = 0.75
    return eps*0.1 if 1-p < eps else 1-p

def double_linear_con(p):
    """Linear decay at double speed, clamped below at 0.125."""
    p *= 2
    eps = 0.125
    return eps if 1-p < eps else 1-p

def double_middle_drop(p):
    """Linear decay with two successive floors: 0.075 once 1-p < 0.75,
    then 0.125 once 1-p < 0.25."""
    eps1 = 0.75
    eps2 = 0.25
    if 1-p < eps1:
        return eps2*0.5 if 1-p < eps2 else eps1*0.1
    return 1-p

# Name -> schedule function lookup used by Scheduler.
schedules = {
    'linear':linear,
    'constant':constant,
    'double_linear_con': double_linear_con,
    'middle_drop': middle_drop,
    'double_middle_drop': double_middle_drop
}
205
+
206
class Scheduler(object):
    """Tracks training progress and yields the current (possibly decayed)
    value of a hyperparameter such as the learning rate."""

    def __init__(self, v, nvalues, schedule):
        # v: initial value; nvalues: total steps the schedule spans;
        # schedule: key into the module-level `schedules` dict.
        self.n = 0.
        self.v = v
        self.nvalues = nvalues
        self.schedule = schedules[schedule]

    def value(self):
        """Return the value for the current internal step, then advance."""
        out = self.v * self.schedule(self.n / self.nvalues)
        self.n += 1.
        return out

    def value_steps(self, steps):
        """Return the value at an arbitrary step count without mutating
        the internal counter."""
        return self.v * self.schedule(steps / self.nvalues)
221
+
222
+
223
class EpisodeStats:
    """Rolling statistics (length and total reward) over recently finished
    episodes, fed from flattened per-batch reward and done-mask arrays."""

    def __init__(self, nsteps, nenvs):
        # One in-progress reward list per environment; an episode closes
        # (and is summarized) whenever the corresponding mask is set.
        self.episode_rewards = [[] for _ in range(nenvs)]
        self.lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
        self.rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards
        self.nsteps = nsteps
        self.nenvs = nenvs

    def feed(self, rewards, masks):
        """Accumulate one batch of per-step rewards; masks mark episode ends."""
        rewards = np.reshape(rewards, [self.nenvs, self.nsteps])
        masks = np.reshape(masks, [self.nenvs, self.nsteps])
        for env_idx in range(self.nenvs):
            for step in range(self.nsteps):
                self.episode_rewards[env_idx].append(rewards[env_idx][step])
                if masks[env_idx][step]:
                    self.lenbuffer.append(len(self.episode_rewards[env_idx]))
                    self.rewbuffer.append(sum(self.episode_rewards[env_idx]))
                    self.episode_rewards[env_idx] = []

    def mean_length(self):
        # 0 until the first episode finishes (nothing buffered yet)
        return np.mean(self.lenbuffer) if self.lenbuffer else 0

    def mean_reward(self):
        return np.mean(self.rewbuffer) if self.rewbuffer else 0
257
+
258
+
259
+ # For ACER
260
def get_by_index(x, idx):
    # Row-wise gather: returns x[i, idx[i]] for each row i of a 2-D tensor,
    # implemented by flattening x and computing flat indices.
    assert(len(x.get_shape()) == 2)
    assert(len(idx.get_shape()) == 1)
    idx_flattened = tf.range(0, x.shape[0]) * x.shape[1] + idx
    y = tf.gather(tf.reshape(x, [-1]),  # flatten input
                  idx_flattened)  # use flattened indices
    return y
267
+
268
def check_shape(ts, shapes):
    """Assert that each tensor in `ts` has the corresponding static shape
    in `shapes` (given as plain python lists)."""
    for i, (t, shape) in enumerate(zip(ts, shapes)):
        assert t.get_shape().as_list() == shape, "id " + str(i) + " shape " + str(t.get_shape()) + str(shape)
273
+
274
def avg_norm(t):
    # Mean (over all leading dims) of the L2 norm taken along the last axis.
    return tf.reduce_mean(input_tensor=tf.sqrt(tf.reduce_sum(input_tensor=tf.square(t), axis=-1)))
276
+
277
def gradient_add(g1, g2, param):
    """Sum two possibly-None gradients for `param`; at least one must be
    present (used to merge per-variable policy and value gradients).

    NOTE: dropped a leftover debug `print([g1, g2, param.name])` — it
    flooded stdout with one line per variable on every graph build.
    """
    assert (not (g1 is None and g2 is None)), param.name
    if g1 is None:
        return g2
    elif g2 is None:
        return g1
    else:
        return g1 + g2
286
+
287
def q_explained_variance(qpred, q):
    # Explained variance of Q predictions: 1 - Var[q - qpred] / Var[q].
    # 1 means perfect prediction; 0 means no better than predicting the mean.
    _, vary = tf.nn.moments(x=q, axes=[0, 1])
    _, varpred = tf.nn.moments(x=q - qpred, axes=[0, 1])
    check_shape([vary, varpred], [[]] * 2)
    return 1.0 - (varpred / vary)
baselines/acer/README.md ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # ACER
2
+
3
+ - Original paper: https://arxiv.org/abs/1611.01224
4
+ - `python -m baselines.run --alg=acer --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on an Atari Pong. See help (`-h`) for more options.
5
+ - also refer to the repo-wide [README.md](../../README.md#training-models)
6
+
baselines/acer/__init__.py ADDED
File without changes
baselines/acer/acer.py ADDED
@@ -0,0 +1,381 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import functools
3
+ import numpy as np
4
+ import tensorflow as tf
5
+ from baselines import logger
6
+
7
+ from baselines.common import set_global_seeds
8
+ from baselines.common.policies import build_policy
9
+ from baselines.common.tf_util import get_session, save_variables, load_variables
10
+ from baselines.common.vec_env.vec_frame_stack import VecFrameStack
11
+
12
+ from baselines.a2c.utils import batch_to_seq, seq_to_batch
13
+ from baselines.a2c.utils import cat_entropy_softmax
14
+ from baselines.a2c.utils import Scheduler, find_trainable_variables
15
+ from baselines.a2c.utils import EpisodeStats
16
+ from baselines.a2c.utils import get_by_index, check_shape, avg_norm, gradient_add, q_explained_variance
17
+ from baselines.acer.buffer import Buffer
18
+ from baselines.acer.runner import Runner
19
+
20
+ # remove last step
21
def strip(var, nenvs, nsteps, flat = False):
    """Drop the last time step: reshape an [nenvs*(nsteps+1), ...] batch
    into a per-step sequence, cut the final step, and flatten back to
    [nenvs*nsteps, ...].

    (Renamed the local from `vars` — it shadowed the `vars` builtin.)
    """
    seq = batch_to_seq(var, nenvs, nsteps + 1, flat)
    return seq_to_batch(seq[:-1], flat)
24
+
25
def q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma):
    """
    Calculates q_retrace targets

    :param R: Rewards
    :param D: Dones
    :param q_i: Q values for actions taken
    :param v: V values
    :param rho_i: Importance weight for each action
    :param nenvs: number of parallel environments
    :param nsteps: rollout length per environment
    :param gamma: discount factor
    :return: Q_retrace values
    """
    rho_bar = batch_to_seq(tf.minimum(1.0, rho_i), nenvs, nsteps, True)  # list of len steps, shape [nenvs]
    rs = batch_to_seq(R, nenvs, nsteps, True)  # list of len steps, shape [nenvs]
    ds = batch_to_seq(D, nenvs, nsteps, True)  # list of len steps, shape [nenvs]
    q_is = batch_to_seq(q_i, nenvs, nsteps, True)
    vs = batch_to_seq(v, nenvs, nsteps + 1, True)
    v_final = vs[-1]
    qret = v_final
    qrets = []
    # Backward recursion of the Retrace target: accumulate discounted reward
    # (zeroed at episode ends), record it, then correct towards V using the
    # truncated importance weight rho_bar before moving to step i-1.
    for i in range(nsteps - 1, -1, -1):
        check_shape([qret, ds[i], rs[i], rho_bar[i], q_is[i], vs[i]], [[nenvs]] * 6)
        qret = rs[i] + gamma * qret * (1.0 - ds[i])
        qrets.append(qret)
        qret = (rho_bar[i] * (qret - q_is[i])) + vs[i]
    qrets = qrets[::-1]  # was built backwards in time
    qret = seq_to_batch(qrets, flat=True)
    return qret
52
+
53
+ # For ACER with PPO clipping instead of trust region
54
+ # def clip(ratio, eps_clip):
55
+ # # assume 0 <= eps_clip <= 1
56
+ # return tf.minimum(1 + eps_clip, tf.maximum(1 - eps_clip, ratio))
57
+
58
class Model(object):
    """ACER model: builds the TF1-style graph for the step/train/polyak
    policies, the Retrace-based losses (with truncated importance sampling,
    bias correction and optional trust-region gradient adjustment), and the
    RMSProp training op.  Exposes train/step/save/load callables."""

    def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef, q_coef, gamma, max_grad_norm, lr,
                 rprop_alpha, rprop_epsilon, total_timesteps, lrschedule,
                 c, trust_region, alpha, delta):

        sess = get_session()
        nact = ac_space.n
        nbatch = nenvs * nsteps

        A = tf.compat.v1.placeholder(tf.int32, [nbatch])  # actions
        D = tf.compat.v1.placeholder(tf.float32, [nbatch])  # dones
        R = tf.compat.v1.placeholder(tf.float32, [nbatch])  # rewards, not returns
        MU = tf.compat.v1.placeholder(tf.float32, [nbatch, nact])  # mu's (behavior-policy probabilities)
        LR = tf.compat.v1.placeholder(tf.float32, [])  # current learning rate
        eps = 1e-6

        # Separate observation placeholders: step model sees one frame per
        # env, train model sees the whole rollout plus one bootstrap step.
        step_ob_placeholder = tf.compat.v1.placeholder(dtype=ob_space.dtype, shape=(nenvs,) + ob_space.shape)
        train_ob_placeholder = tf.compat.v1.placeholder(dtype=ob_space.dtype, shape=(nenvs*(nsteps+1),) + ob_space.shape)
        with tf.compat.v1.variable_scope('acer_model', reuse=tf.compat.v1.AUTO_REUSE):

            step_model = policy(nbatch=nenvs, nsteps=1, observ_placeholder=step_ob_placeholder, sess=sess)
            train_model = policy(nbatch=nbatch, nsteps=nsteps, observ_placeholder=train_ob_placeholder, sess=sess)


        params = find_trainable_variables("acer_model")
        print("Params {}".format(len(params)))
        for var in params:
            print(var)

        # create polyak averaged model
        ema = tf.train.ExponentialMovingAverage(alpha)
        ema_apply_op = ema.apply(params)

        def custom_getter(getter, *args, **kwargs):
            # Substitute each variable with its EMA shadow so polyak_model
            # evaluates with averaged weights.
            v = ema.average(getter(*args, **kwargs))
            print(v.name)
            return v

        with tf.compat.v1.variable_scope("acer_model", custom_getter=custom_getter, reuse=True):
            polyak_model = policy(nbatch=nbatch, nsteps=nsteps, observ_placeholder=train_ob_placeholder, sess=sess)

        # Notation: (var) = batch variable, (var)s = sequence variable, (var)_i = variable indexed by action at step i

        # action probability distributions according to train_model, polyak_model and step_model
        # policy.pi is probability distribution parameters; to obtain distribution that sums to 1 need to take softmax
        train_model_p = tf.nn.softmax(train_model.pi)
        polyak_model_p = tf.nn.softmax(polyak_model.pi)
        step_model_p = tf.nn.softmax(step_model.pi)
        v = tf.reduce_sum(input_tensor=train_model_p * train_model.q, axis = -1)  # shape is [nenvs * (nsteps + 1)]

        # strip off last step
        f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps), [train_model_p, polyak_model_p, train_model.q])
        # Get pi and q values for actions taken
        f_i = get_by_index(f, A)
        q_i = get_by_index(q, A)

        # Compute ratios for importance truncation
        rho = f / (MU + eps)
        rho_i = get_by_index(rho, A)

        # Calculate Q_retrace targets
        qret = q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma)

        # Calculate losses
        # Entropy
        # entropy = tf.reduce_mean(strip(train_model.pd.entropy(), nenvs, nsteps))
        entropy = tf.reduce_mean(input_tensor=cat_entropy_softmax(f))

        # Policy Gradient loss, with truncated importance sampling & bias correction
        v = strip(v, nenvs, nsteps, True)
        check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4)
        check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 2)

        # Truncated importance sampling
        adv = qret - v
        logf = tf.math.log(f_i + eps)
        gain_f = logf * tf.stop_gradient(adv * tf.minimum(c, rho_i))  # [nenvs * nsteps]
        loss_f = -tf.reduce_mean(input_tensor=gain_f)

        # Bias correction for the truncation
        adv_bc = (q - tf.reshape(v, [nenvs * nsteps, 1]))  # [nenvs * nsteps, nact]
        logf_bc = tf.math.log(f + eps)  # / (f_old + eps)
        check_shape([adv_bc, logf_bc], [[nenvs * nsteps, nact]]*2)
        gain_bc = tf.reduce_sum(input_tensor=logf_bc * tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (c / (rho + eps))) * f), axis = 1)  # IMP: This is sum, as expectation wrt f
        loss_bc= -tf.reduce_mean(input_tensor=gain_bc)

        loss_policy = loss_f + loss_bc

        # Value/Q function loss, and explained variance
        check_shape([qret, q_i], [[nenvs * nsteps]]*2)
        ev = q_explained_variance(tf.reshape(q_i, [nenvs, nsteps]), tf.reshape(qret, [nenvs, nsteps]))
        loss_q = tf.reduce_mean(input_tensor=tf.square(tf.stop_gradient(qret) - q_i)*0.5)

        # Net loss
        check_shape([loss_policy, loss_q, entropy], [[]] * 3)
        loss = loss_policy + q_coef * loss_q - ent_coef * entropy

        if trust_region:
            # Gradients wrt the policy statistics f, adjusted so the step
            # stays within a KL ball of radius `delta` around the polyak
            # (averaged) policy.
            g = tf.gradients(ys=- (loss_policy - ent_coef * entropy) * nsteps * nenvs, xs=f)  # [nenvs * nsteps, nact]
            # k = tf.gradients(KL(f_pol || f), f)
            k = - f_pol / (f + eps)  # [nenvs * nsteps, nact] # Directly computed gradient of KL divergence wrt f
            k_dot_g = tf.reduce_sum(input_tensor=k * g, axis=-1)
            adj = tf.maximum(0.0, (tf.reduce_sum(input_tensor=k * g, axis=-1) - delta) / (tf.reduce_sum(input_tensor=tf.square(k), axis=-1) + eps))  # [nenvs * nsteps]

            # Calculate stats (before doing adjustment) for logging.
            avg_norm_k = avg_norm(k)
            avg_norm_g = avg_norm(g)
            avg_norm_k_dot_g = tf.reduce_mean(input_tensor=tf.abs(k_dot_g))
            avg_norm_adj = tf.reduce_mean(input_tensor=tf.abs(adj))

            g = g - tf.reshape(adj, [nenvs * nsteps, 1]) * k
            grads_f = -g/(nenvs*nsteps)  # These are trust region adjusted gradients wrt f ie statistics of policy pi
            grads_policy = tf.gradients(ys=f, xs=params, grad_ys=grads_f)
            grads_q = tf.gradients(ys=loss_q * q_coef, xs=params)
            grads = [gradient_add(g1, g2, param) for (g1, g2, param) in zip(grads_policy, grads_q, params)]

            avg_norm_grads_f = avg_norm(grads_f) * (nsteps * nenvs)
            norm_grads_q = tf.linalg.global_norm(grads_q)
            norm_grads_policy = tf.linalg.global_norm(grads_policy)
        else:
            grads = tf.gradients(ys=loss, xs=params)

        if max_grad_norm is not None:
            grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.compat.v1.train.RMSPropOptimizer(learning_rate=LR, decay=rprop_alpha, epsilon=rprop_epsilon)
        _opt_op = trainer.apply_gradients(grads)

        # so when you call _train, you first do the gradient step, then you apply ema
        with tf.control_dependencies([_opt_op]):
            _train = tf.group(ema_apply_op)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        # Ops/Summaries to run, and their names for logging
        run_ops = [_train, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, ev, norm_grads]
        names_ops = ['loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc', 'explained_variance',
                     'norm_grads']
        if trust_region:
            run_ops = run_ops + [norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k, avg_norm_g, avg_norm_k_dot_g,
                                 avg_norm_adj]
            names_ops = names_ops + ['norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f', 'avg_norm_k', 'avg_norm_g',
                                     'avg_norm_k_dot_g', 'avg_norm_adj']

        def train(obs, actions, rewards, dones, mus, states, masks, steps):
            # One optimizer step on a batch; returns op names and values for logging.
            cur_lr = lr.value_steps(steps)
            td_map = {train_model.X: obs, polyak_model.X: obs, A: actions, R: rewards, D: dones, MU: mus, LR: cur_lr}
            if states is not None:
                # recurrent policy: also feed state and episode masks
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
                td_map[polyak_model.S] = states
                td_map[polyak_model.M] = masks

            return names_ops, sess.run(run_ops, td_map)[1:]  # strip off _train

        def _step(observation, **kwargs):
            return step_model._evaluate([step_model.action, step_model_p, step_model.state], observation, **kwargs)



        self.train = train
        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)
        self.train_model = train_model
        self.step_model = step_model
        self._step = _step
        self.step = self.step_model.step

        self.initial_state = step_model.initial_state
        tf.compat.v1.global_variables_initializer().run(session=sess)
228
+
229
+
230
class Acer():
    """Training driver: alternates on-policy rollouts (which also feed the
    replay buffer) with off-policy batches sampled from it, and logs
    periodic statistics."""

    def __init__(self, runner, model, buffer, log_interval):
        self.runner = runner
        self.model = model
        self.buffer = buffer  # may be None when replay is disabled
        self.log_interval = log_interval
        self.tstart = None  # set by the caller before training starts
        self.episode_stats = EpisodeStats(runner.nsteps, runner.nenv)
        self.steps = None  # set by the caller (env steps taken so far)

    def call(self, on_policy):
        """Run one training call.

        on_policy=True: collect a fresh rollout, store it in the buffer
        (if any), and train on it.  on_policy=False: train on a batch
        sampled from the replay buffer.
        """
        runner, model, buffer, steps = self.runner, self.model, self.buffer, self.steps
        if on_policy:
            enc_obs, obs, actions, rewards, mus, dones, masks = runner.run()
            self.episode_stats.feed(rewards, dones)
            if buffer is not None:
                buffer.put(enc_obs, actions, rewards, mus, dones, masks)
        else:
            # get obs, actions, rewards, mus, dones from buffer.
            obs, actions, rewards, mus, dones, masks = buffer.get()


        # reshape stuff correctly (flatten [nenv, nsteps] -> [nbatch])
        obs = obs.reshape(runner.batch_ob_shape)
        actions = actions.reshape([runner.nbatch])
        rewards = rewards.reshape([runner.nbatch])
        mus = mus.reshape([runner.nbatch, runner.nact])
        dones = dones.reshape([runner.nbatch])
        masks = masks.reshape([runner.batch_ob_shape[0]])

        names_ops, values_ops = model.train(obs, actions, rewards, dones, mus, model.initial_state, masks, steps)

        # Only log on on-policy updates, every `log_interval` updates.
        if on_policy and (int(steps/runner.nbatch) % self.log_interval == 0):
            logger.record_tabular("total_timesteps", steps)
            logger.record_tabular("fps", int(steps/(time.time() - self.tstart)))
            # IMP: In EpisodicLife env, during training, we get done=True at each loss of life, not just at the terminal state.
            # Thus, this is mean until end of life, not end of episode.
            # For true episode rewards, see the monitor files in the log folder.
            logger.record_tabular("mean_episode_length", self.episode_stats.mean_length())
            logger.record_tabular("mean_episode_reward", self.episode_stats.mean_reward())
            for name, val in zip(names_ops, values_ops):
                logger.record_tabular(name, float(val))
            logger.dump_tabular()
273
+
274
+
275
def learn(network, env, seed=None, nsteps=20, total_timesteps=int(80e6), q_coef=0.5, ent_coef=0.01,
          max_grad_norm=10, lr=7e-4, lrschedule='linear', rprop_epsilon=1e-5, rprop_alpha=0.99, gamma=0.99,
          log_interval=100, buffer_size=50000, replay_ratio=4, replay_start=10000, c=10.0,
          trust_region=True, alpha=0.99, delta=1, load_path=None, **network_kwargs):

    '''
    Main entrypoint for ACER (Actor-Critic with Experience Replay) algorithm (https://arxiv.org/pdf/1611.01224.pdf)
    Train an agent with given network architecture on a given environment using ACER.

    Parameters:
    ----------

    network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
            specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
            tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
            neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
            See baselines.common/policies.py/lstm for more details on using recurrent nets in policies

    env: environment. Needs to be vectorized for parallel environment simulation.
        The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.

    nsteps: int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
            nenv is number of environment copies simulated in parallel) (default: 20)

    total_timesteps: int, number of timesteps (i.e. number of actions taken in the environment) (default: 80M)

    q_coef: float, value function loss coefficient in the optimization objective (analog of vf_coef for other actor-critic methods)

    ent_coef: float, policy entropy coefficient in the optimization objective (default: 0.01)

    max_grad_norm: float, gradient norm clipping coefficient. If set to None, no clipping. (default: 10),

    lr: float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4)

    lrschedule: schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and
                returns fraction of the learning rate (specified as lr) as output

    rprop_epsilon: float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5)

    rprop_alpha: float, RMSProp decay parameter (default: 0.99)

    gamma: float, reward discounting factor (default: 0.99)

    log_interval: int, number of updates between logging events (default: 100)

    buffer_size: int, size of the replay buffer (default: 50k)

    replay_ratio: int, how many (on average) batches of data to sample from the replay buffer take after batch from the environment (default: 4)

    replay_start: int, the sampling from the replay buffer does not start until replay buffer has at least that many samples (default: 10k)

    c: float, importance weight clipping factor (default: 10)

    trust_region: bool, whether or not algorithms estimates the gradient KL divergence between the old and updated policy and uses it to determine step size (default: True)

    delta: float, max KL divergence between the old policy and updated policy (default: 1)

    alpha: float, momentum factor in the Polyak (exponential moving average) averaging of the model parameters (default: 0.99)

    load_path: str, path to load the model from (default: None)

    **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                      For instance, 'mlp' network architecture has arguments num_hidden and num_layers.

    '''

    print("Running Acer Simple")
    print(locals())
    set_global_seeds(seed)
    # ACER's buffer/runner assume a VecFrameStack interface (env.nstack);
    # wrap with a no-op stack of 1 if the caller did not stack frames.
    if not isinstance(env, VecFrameStack):
        env = VecFrameStack(env, 1)

    policy = build_policy(env, network, estimate_q=True, **network_kwargs)
    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space

    # NOTE: removed an unused local `nstack = env.nstack` — nothing below
    # read it (Buffer queries env.nstack itself).
    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps,
                  ent_coef=ent_coef, q_coef=q_coef, gamma=gamma,
                  max_grad_norm=max_grad_norm, lr=lr, rprop_alpha=rprop_alpha, rprop_epsilon=rprop_epsilon,
                  total_timesteps=total_timesteps, lrschedule=lrschedule, c=c,
                  trust_region=trust_region, alpha=alpha, delta=delta)

    if load_path is not None:
        model.load(load_path)

    runner = Runner(env=env, model=model, nsteps=nsteps)
    if replay_ratio > 0:
        buffer = Buffer(env=env, nsteps=nsteps, size=buffer_size)
    else:
        buffer = None
    nbatch = nenvs*nsteps
    acer = Acer(runner, model, buffer, log_interval)
    acer.tstart = time.time()

    for acer.steps in range(0, total_timesteps, nbatch):  # nbatch samples, 1 on_policy call and multiple off-policy calls
        acer.call(on_policy=True)
        if replay_ratio > 0 and buffer.has_atleast(replay_start):
            n = np.random.poisson(replay_ratio)
            for _ in range(n):
                acer.call(on_policy=False)  # no simulation steps in this

    return model
baselines/acer/buffer.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
+ class Buffer(object):
4
+ # gets obs, actions, rewards, mu's, (states, masks), dones
5
    def __init__(self, env, nsteps, size=50000):
        # env: vectorized (frame-stacked) environment; nsteps: rollout
        # length; size: total buffer capacity in time steps (converted to
        # whole-rollout slots below).
        self.nenv = env.num_envs
        self.nsteps = nsteps
        # self.nh, self.nw, self.nc = env.observation_space.shape
        self.obs_shape = env.observation_space.shape
        self.obs_dtype = env.observation_space.dtype
        self.ac_dtype = env.action_space.dtype
        self.nc = self.obs_shape[-1]
        self.nstack = env.nstack
        self.nc //= self.nstack  # channels of a single (un-stacked) frame
        self.nbatch = self.nenv * self.nsteps
        self.size = size // (self.nsteps)  # Each loc contains nenv * nsteps frames, thus total buffer is nenv * size frames

        # Memory (arrays are allocated lazily on the first put())
        self.enc_obs = None
        self.actions = None
        self.rewards = None
        self.mus = None
        self.dones = None
        self.masks = None

        # Size indexes
        self.next_idx = 0       # ring-buffer write position
        self.num_in_buffer = 0  # number of filled slots (saturates at self.size)
29
+
30
+ def has_atleast(self, frames):
31
+ # Frames per env, so total (nenv * frames) Frames needed
32
+ # Each buffer loc has nenv * nsteps frames
33
+ return self.num_in_buffer >= (frames // self.nsteps)
34
+
35
    def can_sample(self):
        # True once at least one rollout has been stored.
        return self.num_in_buffer > 0
37
+
38
    # Generate stacked frames
    def decode(self, enc_obs, dones):
        # enc_obs has shape [nenvs, nsteps + nstack, nh, nw, nc]
        # dones has shape [nenvs, nsteps]
        # returns stacked obs of shape [nenv, (nsteps + 1), nh, nw, nstack*nc]
        # Delegates the actual frame stacking to the module-level _stack_obs
        # helper (defined elsewhere in this file).
        return _stack_obs(enc_obs, dones,
                          nsteps=self.nsteps)
46
+
47
+ def put(self, enc_obs, actions, rewards, mus, dones, masks):
48
+ # enc_obs [nenv, (nsteps + nstack), nh, nw, nc]
49
+ # actions, rewards, dones [nenv, nsteps]
50
+ # mus [nenv, nsteps, nact]
51
+
52
+ if self.enc_obs is None:
53
+ self.enc_obs = np.empty([self.size] + list(enc_obs.shape), dtype=self.obs_dtype)
54
+ self.actions = np.empty([self.size] + list(actions.shape), dtype=self.ac_dtype)
55
+ self.rewards = np.empty([self.size] + list(rewards.shape), dtype=np.float32)
56
+ self.mus = np.empty([self.size] + list(mus.shape), dtype=np.float32)
57
+ self.dones = np.empty([self.size] + list(dones.shape), dtype=np.bool)
58
+ self.masks = np.empty([self.size] + list(masks.shape), dtype=np.bool)
59
+
60
+ self.enc_obs[self.next_idx] = enc_obs
61
+ self.actions[self.next_idx] = actions
62
+ self.rewards[self.next_idx] = rewards
63
+ self.mus[self.next_idx] = mus
64
+ self.dones[self.next_idx] = dones
65
+ self.masks[self.next_idx] = masks
66
+
67
+ self.next_idx = (self.next_idx + 1) % self.size
68
+ self.num_in_buffer = min(self.size, self.num_in_buffer + 1)
69
+
70
+ def take(self, x, idx, envx):
71
+ nenv = self.nenv
72
+ out = np.empty([nenv] + list(x.shape[2:]), dtype=x.dtype)
73
+ for i in range(nenv):
74
+ out[i] = x[idx[i], envx[i]]
75
+ return out
76
+
77
+ def get(self):
78
+ # returns
79
+ # obs [nenv, (nsteps + 1), nh, nw, nstack*nc]
80
+ # actions, rewards, dones [nenv, nsteps]
81
+ # mus [nenv, nsteps, nact]
82
+ nenv = self.nenv
83
+ assert self.can_sample()
84
+
85
+ # Sample exactly one id per env. If you sample across envs, then higher correlation in samples from same env.
86
+ idx = np.random.randint(0, self.num_in_buffer, nenv)
87
+ envx = np.arange(nenv)
88
+
89
+ take = lambda x: self.take(x, idx, envx) # for i in range(nenv)], axis = 0)
90
+ dones = take(self.dones)
91
+ enc_obs = take(self.enc_obs)
92
+ obs = self.decode(enc_obs, dones)
93
+ actions = take(self.actions)
94
+ rewards = take(self.rewards)
95
+ mus = take(self.mus)
96
+ masks = take(self.masks)
97
+ return obs, actions, rewards, mus, dones, masks
98
+
99
+
100
+
101
+ def _stack_obs_ref(enc_obs, dones, nsteps):
102
+ nenv = enc_obs.shape[0]
103
+ nstack = enc_obs.shape[1] - nsteps
104
+ nh, nw, nc = enc_obs.shape[2:]
105
+ obs_dtype = enc_obs.dtype
106
+ obs_shape = (nh, nw, nc*nstack)
107
+
108
+ mask = np.empty([nsteps + nstack - 1, nenv, 1, 1, 1], dtype=np.float32)
109
+ obs = np.zeros([nstack, nsteps + nstack, nenv, nh, nw, nc], dtype=obs_dtype)
110
+ x = np.reshape(enc_obs, [nenv, nsteps + nstack, nh, nw, nc]).swapaxes(1, 0) # [nsteps + nstack, nenv, nh, nw, nc]
111
+
112
+ mask[nstack-1:] = np.reshape(1.0 - dones, [nenv, nsteps, 1, 1, 1]).swapaxes(1, 0) # keep
113
+ mask[:nstack-1] = 1.0
114
+
115
+ # y = np.reshape(1 - dones, [nenvs, nsteps, 1, 1, 1])
116
+ for i in range(nstack):
117
+ obs[-(i + 1), i:] = x
118
+ # obs[:,i:,:,:,-(i+1),:] = x
119
+ x = x[:-1] * mask
120
+ mask = mask[1:]
121
+
122
+ return np.reshape(obs[:, (nstack-1):].transpose((2, 1, 3, 4, 0, 5)), (nenv, (nsteps + 1)) + obs_shape)
123
+
124
+ def _stack_obs(enc_obs, dones, nsteps):
125
+ nenv = enc_obs.shape[0]
126
+ nstack = enc_obs.shape[1] - nsteps
127
+ nc = enc_obs.shape[-1]
128
+
129
+ obs_ = np.zeros((nenv, nsteps + 1) + enc_obs.shape[2:-1] + (enc_obs.shape[-1] * nstack, ), dtype=enc_obs.dtype)
130
+ mask = np.ones((nenv, nsteps+1), dtype=enc_obs.dtype)
131
+ mask[:, 1:] = 1.0 - dones
132
+ mask = mask.reshape(mask.shape + tuple(np.ones(len(enc_obs.shape)-2, dtype=np.uint8)))
133
+
134
+ for i in range(nstack-1, -1, -1):
135
+ obs_[..., i * nc : (i + 1) * nc] = enc_obs[:, i : i + nsteps + 1, :]
136
+ if i < nstack-1:
137
+ obs_[..., i * nc : (i + 1) * nc] *= mask
138
+ mask[:, 1:, ...] *= mask[:, :-1, ...]
139
+
140
+ return obs_
141
+
142
+ def test_stack_obs():
143
+ nstack = 7
144
+ nenv = 1
145
+ nsteps = 5
146
+
147
+ obs_shape = (2, 3, nstack)
148
+
149
+ enc_obs_shape = (nenv, nsteps + nstack) + obs_shape[:-1] + (1,)
150
+ enc_obs = np.random.random(enc_obs_shape)
151
+ dones = np.random.randint(low=0, high=2, size=(nenv, nsteps))
152
+
153
+ stacked_obs_ref = _stack_obs_ref(enc_obs, dones, nsteps=nsteps)
154
+ stacked_obs_test = _stack_obs(enc_obs, dones, nsteps=nsteps)
155
+
156
+ np.testing.assert_allclose(stacked_obs_ref, stacked_obs_test)
baselines/acer/defaults.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ def atari():
2
+ return dict(
3
+ lrschedule='constant'
4
+ )
baselines/acer/policies.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import tensorflow as tf
3
+ from baselines.common.policies import nature_cnn
4
+ from baselines.a2c.utils import fc, batch_to_seq, seq_to_batch, lstm, sample
5
+
6
+
7
+ class AcerCnnPolicy(object):
8
+
9
+ def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
10
+ nbatch = nenv * nsteps
11
+ nh, nw, nc = ob_space.shape
12
+ ob_shape = (nbatch, nh, nw, nc * nstack)
13
+ nact = ac_space.n
14
+ X = tf.compat.v1.placeholder(tf.uint8, ob_shape) # obs
15
+ with tf.compat.v1.variable_scope("model", reuse=reuse):
16
+ h = nature_cnn(X)
17
+ pi_logits = fc(h, 'pi', nact, init_scale=0.01)
18
+ pi = tf.nn.softmax(pi_logits)
19
+ q = fc(h, 'q', nact)
20
+
21
+ a = sample(tf.nn.softmax(pi_logits)) # could change this to use self.pi instead
22
+ self.initial_state = [] # not stateful
23
+ self.X = X
24
+ self.pi = pi # actual policy params now
25
+ self.pi_logits = pi_logits
26
+ self.q = q
27
+ self.vf = q
28
+
29
+ def step(ob, *args, **kwargs):
30
+ # returns actions, mus, states
31
+ a0, pi0 = sess.run([a, pi], {X: ob})
32
+ return a0, pi0, [] # dummy state
33
+
34
+ def out(ob, *args, **kwargs):
35
+ pi0, q0 = sess.run([pi, q], {X: ob})
36
+ return pi0, q0
37
+
38
+ def act(ob, *args, **kwargs):
39
+ return sess.run(a, {X: ob})
40
+
41
+ self.step = step
42
+ self.out = out
43
+ self.act = act
44
+
45
+ class AcerLstmPolicy(object):
46
+
47
+ def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False, nlstm=256):
48
+ nbatch = nenv * nsteps
49
+ nh, nw, nc = ob_space.shape
50
+ ob_shape = (nbatch, nh, nw, nc * nstack)
51
+ nact = ac_space.n
52
+ X = tf.compat.v1.placeholder(tf.uint8, ob_shape) # obs
53
+ M = tf.compat.v1.placeholder(tf.float32, [nbatch]) #mask (done t-1)
54
+ S = tf.compat.v1.placeholder(tf.float32, [nenv, nlstm*2]) #states
55
+ with tf.compat.v1.variable_scope("model", reuse=reuse):
56
+ h = nature_cnn(X)
57
+
58
+ # lstm
59
+ xs = batch_to_seq(h, nenv, nsteps)
60
+ ms = batch_to_seq(M, nenv, nsteps)
61
+ h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
62
+ h5 = seq_to_batch(h5)
63
+
64
+ pi_logits = fc(h5, 'pi', nact, init_scale=0.01)
65
+ pi = tf.nn.softmax(pi_logits)
66
+ q = fc(h5, 'q', nact)
67
+
68
+ a = sample(pi_logits) # could change this to use self.pi instead
69
+ self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
70
+ self.X = X
71
+ self.M = M
72
+ self.S = S
73
+ self.pi = pi # actual policy params now
74
+ self.q = q
75
+
76
+ def step(ob, state, mask, *args, **kwargs):
77
+ # returns actions, mus, states
78
+ a0, pi0, s = sess.run([a, pi, snew], {X: ob, S: state, M: mask})
79
+ return a0, pi0, s
80
+
81
+ self.step = step
baselines/acer/runner.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from baselines.common.runners import AbstractEnvRunner
3
+ from baselines.common.vec_env.vec_frame_stack import VecFrameStack
4
+ from gym import spaces
5
+
6
+
7
+ class Runner(AbstractEnvRunner):
8
+
9
+ def __init__(self, env, model, nsteps):
10
+ super().__init__(env=env, model=model, nsteps=nsteps)
11
+ assert isinstance(env.action_space, spaces.Discrete), 'This ACER implementation works only with discrete action spaces!'
12
+ assert isinstance(env, VecFrameStack)
13
+
14
+ self.nact = env.action_space.n
15
+ nenv = self.nenv
16
+ self.nbatch = nenv * nsteps
17
+ self.batch_ob_shape = (nenv*(nsteps+1),) + env.observation_space.shape
18
+
19
+ self.obs = env.reset()
20
+ self.obs_dtype = env.observation_space.dtype
21
+ self.ac_dtype = env.action_space.dtype
22
+ self.nstack = self.env.nstack
23
+ self.nc = self.batch_ob_shape[-1] // self.nstack
24
+
25
+
26
+ def run(self):
27
+ # enc_obs = np.split(self.obs, self.nstack, axis=3) # so now list of obs steps
28
+ enc_obs = np.split(self.env.stackedobs, self.env.nstack, axis=-1)
29
+ mb_obs, mb_actions, mb_mus, mb_dones, mb_rewards = [], [], [], [], []
30
+ for _ in range(self.nsteps):
31
+ actions, mus, states = self.model._step(self.obs, S=self.states, M=self.dones)
32
+ mb_obs.append(np.copy(self.obs))
33
+ mb_actions.append(actions)
34
+ mb_mus.append(mus)
35
+ mb_dones.append(self.dones)
36
+ obs, rewards, dones, _ = self.env.step(actions)
37
+ # states information for statefull models like LSTM
38
+ self.states = states
39
+ self.dones = dones
40
+ self.obs = obs
41
+ mb_rewards.append(rewards)
42
+ enc_obs.append(obs[..., -self.nc:])
43
+ mb_obs.append(np.copy(self.obs))
44
+ mb_dones.append(self.dones)
45
+
46
+ enc_obs = np.asarray(enc_obs, dtype=self.obs_dtype).swapaxes(1, 0)
47
+ mb_obs = np.asarray(mb_obs, dtype=self.obs_dtype).swapaxes(1, 0)
48
+ mb_actions = np.asarray(mb_actions, dtype=self.ac_dtype).swapaxes(1, 0)
49
+ mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
50
+ mb_mus = np.asarray(mb_mus, dtype=np.float32).swapaxes(1, 0)
51
+
52
+ mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
53
+
54
+ mb_masks = mb_dones # Used for statefull models like LSTM's to mask state when done
55
+ mb_dones = mb_dones[:, 1:] # Used for calculating returns. The dones array is now aligned with rewards
56
+
57
+ # shapes are now [nenv, nsteps, []]
58
+ # When pulling from buffer, arrays will now be reshaped in place, preventing a deep copy.
59
+
60
+ return enc_obs, mb_obs, mb_actions, mb_rewards, mb_mus, mb_dones, mb_masks
61
+
baselines/acktr/README.md ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # ACKTR
2
+
3
+ - Original paper: https://arxiv.org/abs/1708.05144
4
+ - Baselines blog post: https://blog.openai.com/baselines-acktr-a2c/
5
+ - `python -m baselines.run --alg=acktr --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on an Atari Pong. See help (`-h`) for more options.
6
+ - also refer to the repo-wide [README.md](../../README.md#training-models)
7
+
8
+ ## ACKTR with continuous action spaces
9
 + The code of ACKTR has been refactored to handle both discrete and continuous action spaces uniformly. In the original version, discrete and continuous action spaces were handled by different code (acktr_disc.py and acktr_cont.py) with little overlap. If you are interested in the original version of ACKTR for continuous action spaces, use the `old_acktr_cont` branch. Note that the original code performs better on the mujoco tasks than the refactored version; we are still investigating why.
baselines/acktr/__init__.py ADDED
File without changes
baselines/acktr/acktr.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os.path as osp
2
+ import time
3
+ import functools
4
+ import tensorflow as tf
5
+ from baselines import logger
6
+
7
+ from baselines.common import set_global_seeds, explained_variance
8
+ from baselines.common.policies import build_policy
9
+ from baselines.common.tf_util import get_session, save_variables, load_variables
10
+
11
+ from baselines.a2c.runner import Runner
12
+ from baselines.a2c.utils import Scheduler, find_trainable_variables
13
+ from baselines.acktr import kfac
14
+ from baselines.ppo2.ppo2 import safemean
15
+ from collections import deque
16
+
17
+
18
+ class Model(object):
19
+
20
+ def __init__(self, policy, ob_space, ac_space, nenvs,total_timesteps, nprocs=32, nsteps=20,
21
+ ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
22
+ kfac_clip=0.001, lrschedule='linear', is_async=True):
23
+
24
+ self.sess = sess = get_session()
25
+ nbatch = nenvs * nsteps
26
+ with tf.compat.v1.variable_scope('acktr_model', reuse=tf.compat.v1.AUTO_REUSE):
27
+ self.model = step_model = policy(nenvs, 1, sess=sess)
28
+ self.model2 = train_model = policy(nenvs*nsteps, nsteps, sess=sess)
29
+
30
+ A = train_model.pdtype.sample_placeholder([None])
31
+ ADV = tf.compat.v1.placeholder(tf.float32, [nbatch])
32
+ R = tf.compat.v1.placeholder(tf.float32, [nbatch])
33
+ PG_LR = tf.compat.v1.placeholder(tf.float32, [])
34
+ VF_LR = tf.compat.v1.placeholder(tf.float32, [])
35
+
36
+ neglogpac = train_model.pd.neglogp(A)
37
+ self.logits = train_model.pi
38
+
39
+ ##training loss
40
+ pg_loss = tf.reduce_mean(input_tensor=ADV*neglogpac)
41
+ entropy = tf.reduce_mean(input_tensor=train_model.pd.entropy())
42
+ pg_loss = pg_loss - ent_coef * entropy
43
+ vf_loss = tf.compat.v1.losses.mean_squared_error(tf.squeeze(train_model.vf), R)
44
+ train_loss = pg_loss + vf_coef * vf_loss
45
+
46
+
47
+ ##Fisher loss construction
48
+ self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(input_tensor=neglogpac)
49
+ sample_net = train_model.vf + tf.random.normal(tf.shape(input=train_model.vf))
50
+ self.vf_fisher = vf_fisher_loss = - vf_fisher_coef*tf.reduce_mean(input_tensor=tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
51
+ self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss
52
+
53
+ self.params=params = find_trainable_variables("acktr_model")
54
+
55
+ self.grads_check = grads = tf.gradients(ys=train_loss,xs=params)
56
+
57
+ with tf.device('/gpu:0'):
58
+ self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\
59
+ momentum=0.9, kfac_update=1, epsilon=0.01,\
60
+ stats_decay=0.99, is_async=is_async, cold_iter=10, max_grad_norm=max_grad_norm)
61
+
62
+ # update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
63
+ optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
64
+ train_op, q_runner = optim.apply_gradients(list(zip(grads,params)))
65
+ self.q_runner = q_runner
66
+ self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)
67
+
68
+ def train(obs, states, rewards, masks, actions, values):
69
+ advs = rewards - values
70
+ for step in range(len(obs)):
71
+ cur_lr = self.lr.value()
72
+
73
+ td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, PG_LR:cur_lr, VF_LR:cur_lr}
74
+ if states is not None:
75
+ td_map[train_model.S] = states
76
+ td_map[train_model.M] = masks
77
+
78
+ policy_loss, value_loss, policy_entropy, _ = sess.run(
79
+ [pg_loss, vf_loss, entropy, train_op],
80
+ td_map
81
+ )
82
+ return policy_loss, value_loss, policy_entropy
83
+
84
+
85
+ self.train = train
86
+ self.save = functools.partial(save_variables, sess=sess)
87
+ self.load = functools.partial(load_variables, sess=sess)
88
+ self.train_model = train_model
89
+ self.step_model = step_model
90
+ self.step = step_model.step
91
+ self.value = step_model.value
92
+ self.initial_state = step_model.initial_state
93
+ tf.compat.v1.global_variables_initializer().run(session=sess)
94
+
95
+ def learn(network, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=100, nprocs=32, nsteps=20,
96
+ ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
97
+ kfac_clip=0.001, save_interval=None, lrschedule='linear', load_path=None, is_async=True, **network_kwargs):
98
+ set_global_seeds(seed)
99
+
100
+
101
+ if network == 'cnn':
102
+ network_kwargs['one_dim_bias'] = True
103
+
104
+ policy = build_policy(env, network, **network_kwargs)
105
+
106
+ nenvs = env.num_envs
107
+ ob_space = env.observation_space
108
+ ac_space = env.action_space
109
+ make_model = lambda : Model(policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=nprocs, nsteps
110
+ =nsteps, ent_coef=ent_coef, vf_coef=vf_coef, vf_fisher_coef=
111
+ vf_fisher_coef, lr=lr, max_grad_norm=max_grad_norm, kfac_clip=kfac_clip,
112
+ lrschedule=lrschedule, is_async=is_async)
113
+ if save_interval and logger.get_dir():
114
+ import cloudpickle
115
+ with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
116
+ fh.write(cloudpickle.dumps(make_model))
117
+ model = make_model()
118
+
119
+ if load_path is not None:
120
+ model.load(load_path)
121
+
122
+ runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
123
+ epinfobuf = deque(maxlen=100)
124
+ nbatch = nenvs*nsteps
125
+ tstart = time.time()
126
+ coord = tf.train.Coordinator()
127
+ if is_async:
128
+ enqueue_threads = model.q_runner.create_threads(model.sess, coord=coord, start=True)
129
+ else:
130
+ enqueue_threads = []
131
+
132
+ for update in range(1, total_timesteps//nbatch+1):
133
+ obs, states, rewards, masks, actions, values, epinfos = runner.run()
134
+ epinfobuf.extend(epinfos)
135
+ policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
136
+ model.old_obs = obs
137
+ nseconds = time.time()-tstart
138
+ fps = int((update*nbatch)/nseconds)
139
+ if update % log_interval == 0 or update == 1:
140
+ ev = explained_variance(values, rewards)
141
+ logger.record_tabular("nupdates", update)
142
+ logger.record_tabular("total_timesteps", update*nbatch)
143
+ logger.record_tabular("fps", fps)
144
+ logger.record_tabular("policy_entropy", float(policy_entropy))
145
+ logger.record_tabular("policy_loss", float(policy_loss))
146
+ logger.record_tabular("value_loss", float(value_loss))
147
+ logger.record_tabular("explained_variance", float(ev))
148
+ logger.record_tabular("eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]))
149
+ logger.record_tabular("eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
150
+ logger.dump_tabular()
151
+
152
+ if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
153
+ savepath = osp.join(logger.get_dir(), 'checkpoint%.5i'%update)
154
+ print('Saving to', savepath)
155
+ model.save(savepath)
156
+ coord.request_stop()
157
+ coord.join(enqueue_threads)
158
+ return model
baselines/acktr/defaults.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ def mujoco():
2
+ return dict(
3
+ nsteps=2500,
4
+ value_network='copy'
5
+ )
baselines/acktr/kfac.py ADDED
@@ -0,0 +1,928 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tensorflow as tf
2
+ import numpy as np
3
+ import re
4
+
5
+ # flake8: noqa F403, F405
6
+ from baselines.acktr.kfac_utils import *
7
+ from functools import reduce
8
+
9
+ KFAC_OPS = ['MatMul', 'Conv2D', 'BiasAdd']
10
+ KFAC_DEBUG = False
11
+
12
+
13
+ class KfacOptimizer():
14
+ # note that KfacOptimizer will be truly synchronous (and thus deterministic) only if a single-threaded session is used
15
+ def __init__(self, learning_rate=0.01, momentum=0.9, clip_kl=0.01, kfac_update=2, stats_accum_iter=60, full_stats_init=False, cold_iter=100, cold_lr=None, is_async=False, async_stats=False, epsilon=1e-2, stats_decay=0.95, blockdiag_bias=False, channel_fac=False, factored_damping=False, approxT2=False, use_float64=False, weight_decay_dict={},max_grad_norm=0.5):
16
+ self.max_grad_norm = max_grad_norm
17
+ self._lr = learning_rate
18
+ self._momentum = momentum
19
+ self._clip_kl = clip_kl
20
+ self._channel_fac = channel_fac
21
+ self._kfac_update = kfac_update
22
+ self._async = is_async
23
+ self._async_stats = async_stats
24
+ self._epsilon = epsilon
25
+ self._stats_decay = stats_decay
26
+ self._blockdiag_bias = blockdiag_bias
27
+ self._approxT2 = approxT2
28
+ self._use_float64 = use_float64
29
+ self._factored_damping = factored_damping
30
+ self._cold_iter = cold_iter
31
+ if cold_lr == None:
32
+ # good heuristics
33
+ self._cold_lr = self._lr# * 3.
34
+ else:
35
+ self._cold_lr = cold_lr
36
+ self._stats_accum_iter = stats_accum_iter
37
+ self._weight_decay_dict = weight_decay_dict
38
+ self._diag_init_coeff = 0.
39
+ self._full_stats_init = full_stats_init
40
+ if not self._full_stats_init:
41
+ self._stats_accum_iter = self._cold_iter
42
+
43
+ self.sgd_step = tf.Variable(0, name='KFAC/sgd_step', trainable=False)
44
+ self.global_step = tf.Variable(
45
+ 0, name='KFAC/global_step', trainable=False)
46
+ self.cold_step = tf.Variable(0, name='KFAC/cold_step', trainable=False)
47
+ self.factor_step = tf.Variable(
48
+ 0, name='KFAC/factor_step', trainable=False)
49
+ self.stats_step = tf.Variable(
50
+ 0, name='KFAC/stats_step', trainable=False)
51
+ self.vFv = tf.Variable(0., name='KFAC/vFv', trainable=False)
52
+
53
+ self.factors = {}
54
+ self.param_vars = []
55
+ self.stats = {}
56
+ self.stats_eigen = {}
57
+
58
+ def getFactors(self, g, varlist):
59
+ graph = tf.compat.v1.get_default_graph()
60
+ factorTensors = {}
61
+ fpropTensors = []
62
+ bpropTensors = []
63
+ opTypes = []
64
+ fops = []
65
+
66
+ def searchFactors(gradient, graph):
67
+ # hard coded search stratergy
68
+ bpropOp = gradient.op
69
+ bpropOp_name = bpropOp.name
70
+
71
+ bTensors = []
72
+ fTensors = []
73
+
74
+ # combining additive gradient, assume they are the same op type and
75
+ # indepedent
76
+ if 'AddN' in bpropOp_name:
77
+ factors = []
78
+ for g in gradient.op.inputs:
79
+ factors.append(searchFactors(g, graph))
80
+ op_names = [item['opName'] for item in factors]
81
+ # TO-DO: need to check all the attribute of the ops as well
82
+ print (gradient.name)
83
+ print (op_names)
84
+ print (len(np.unique(op_names)))
85
+ assert len(np.unique(op_names)) == 1, gradient.name + \
86
+ ' is shared among different computation OPs'
87
+
88
+ bTensors = reduce(lambda x, y: x + y,
89
+ [item['bpropFactors'] for item in factors])
90
+ if len(factors[0]['fpropFactors']) > 0:
91
+ fTensors = reduce(
92
+ lambda x, y: x + y, [item['fpropFactors'] for item in factors])
93
+ fpropOp_name = op_names[0]
94
+ fpropOp = factors[0]['op']
95
+ else:
96
+ fpropOp_name = re.search(
97
+ 'gradientsSampled(_[0-9]+|)/(.+?)_grad', bpropOp_name).group(2)
98
+ fpropOp = graph.get_operation_by_name(fpropOp_name)
99
+ if fpropOp.op_def.name in KFAC_OPS:
100
+ # Known OPs
101
+ ###
102
+ bTensor = [
103
+ i for i in bpropOp.inputs if 'gradientsSampled' in i.name][-1]
104
+ bTensorShape = fpropOp.outputs[0].get_shape()
105
+ if bTensor.get_shape()[0].value == None:
106
+ bTensor.set_shape(bTensorShape)
107
+ bTensors.append(bTensor)
108
+ ###
109
+ if fpropOp.op_def.name == 'BiasAdd':
110
+ fTensors = []
111
+ else:
112
+ fTensors.append(
113
+ [i for i in fpropOp.inputs if param.op.name not in i.name][0])
114
+ fpropOp_name = fpropOp.op_def.name
115
+ else:
116
+ # unknown OPs, block approximation used
117
+ bInputsList = [i for i in bpropOp.inputs[
118
+ 0].op.inputs if 'gradientsSampled' in i.name if 'Shape' not in i.name]
119
+ if len(bInputsList) > 0:
120
+ bTensor = bInputsList[0]
121
+ bTensorShape = fpropOp.outputs[0].get_shape()
122
+ if len(bTensor.get_shape()) > 0 and bTensor.get_shape()[0].value == None:
123
+ bTensor.set_shape(bTensorShape)
124
+ bTensors.append(bTensor)
125
+ fpropOp_name = opTypes.append('UNK-' + fpropOp.op_def.name)
126
+
127
+ return {'opName': fpropOp_name, 'op': fpropOp, 'fpropFactors': fTensors, 'bpropFactors': bTensors}
128
+
129
+ for t, param in zip(g, varlist):
130
+ if KFAC_DEBUG:
131
+ print(('get factor for '+param.name))
132
+ factors = searchFactors(t, graph)
133
+ factorTensors[param] = factors
134
+
135
+ ########
136
+ # check associated weights and bias for homogeneous coordinate representation
137
+ # and check redundent factors
138
+ # TO-DO: there may be a bug to detect associate bias and weights for
139
+ # forking layer, e.g. in inception models.
140
+ for param in varlist:
141
+ factorTensors[param]['assnWeights'] = None
142
+ factorTensors[param]['assnBias'] = None
143
+ for param in varlist:
144
+ if factorTensors[param]['opName'] == 'BiasAdd':
145
+ factorTensors[param]['assnWeights'] = None
146
+ for item in varlist:
147
+ if len(factorTensors[item]['bpropFactors']) > 0:
148
+ if (set(factorTensors[item]['bpropFactors']) == set(factorTensors[param]['bpropFactors'])) and (len(factorTensors[item]['fpropFactors']) > 0):
149
+ factorTensors[param]['assnWeights'] = item
150
+ factorTensors[item]['assnBias'] = param
151
+ factorTensors[param]['bpropFactors'] = factorTensors[
152
+ item]['bpropFactors']
153
+
154
+ ########
155
+
156
+ ########
157
+ # concatenate the additive gradients along the batch dimension, i.e.
158
+ # assuming independence structure
159
+ for key in ['fpropFactors', 'bpropFactors']:
160
+ for i, param in enumerate(varlist):
161
+ if len(factorTensors[param][key]) > 0:
162
+ if (key + '_concat') not in factorTensors[param]:
163
+ name_scope = factorTensors[param][key][0].name.split(':')[
164
+ 0]
165
+ with tf.compat.v1.name_scope(name_scope):
166
+ factorTensors[param][
167
+ key + '_concat'] = tf.concat(factorTensors[param][key], 0)
168
+ else:
169
+ factorTensors[param][key + '_concat'] = None
170
+ for j, param2 in enumerate(varlist[(i + 1):]):
171
+ if (len(factorTensors[param][key]) > 0) and (set(factorTensors[param2][key]) == set(factorTensors[param][key])):
172
+ factorTensors[param2][key] = factorTensors[param][key]
173
+ factorTensors[param2][
174
+ key + '_concat'] = factorTensors[param][key + '_concat']
175
+ ########
176
+
177
+ if KFAC_DEBUG:
178
+ for items in zip(varlist, fpropTensors, bpropTensors, opTypes):
179
+ print((items[0].name, factorTensors[item]))
180
+ self.factors = factorTensors
181
+ return factorTensors
182
+
183
+ def getStats(self, factors, varlist):
184
+ if len(self.stats) == 0:
185
+ # initialize stats variables on CPU because eigen decomp is
186
+ # computed on CPU
187
+ with tf.device('/cpu'):
188
+ tmpStatsCache = {}
189
+
190
+ # search for tensor factors and
191
+ # use block diag approx for the bias units
192
+ for var in varlist:
193
+ fpropFactor = factors[var]['fpropFactors_concat']
194
+ bpropFactor = factors[var]['bpropFactors_concat']
195
+ opType = factors[var]['opName']
196
+ if opType == 'Conv2D':
197
+ Kh = var.get_shape()[0]
198
+ Kw = var.get_shape()[1]
199
+ C = fpropFactor.get_shape()[-1]
200
+
201
+ Oh = bpropFactor.get_shape()[1]
202
+ Ow = bpropFactor.get_shape()[2]
203
+ if Oh == 1 and Ow == 1 and self._channel_fac:
204
+ # factorization along the channels do not support
205
+ # homogeneous coordinate
206
+ var_assnBias = factors[var]['assnBias']
207
+ if var_assnBias:
208
+ factors[var]['assnBias'] = None
209
+ factors[var_assnBias]['assnWeights'] = None
210
+ ##
211
+
212
+ for var in varlist:
213
+ fpropFactor = factors[var]['fpropFactors_concat']
214
+ bpropFactor = factors[var]['bpropFactors_concat']
215
+ opType = factors[var]['opName']
216
+ self.stats[var] = {'opName': opType,
217
+ 'fprop_concat_stats': [],
218
+ 'bprop_concat_stats': [],
219
+ 'assnWeights': factors[var]['assnWeights'],
220
+ 'assnBias': factors[var]['assnBias'],
221
+ }
222
+ if fpropFactor is not None:
223
+ if fpropFactor not in tmpStatsCache:
224
+ if opType == 'Conv2D':
225
+ Kh = var.get_shape()[0]
226
+ Kw = var.get_shape()[1]
227
+ C = fpropFactor.get_shape()[-1]
228
+
229
+ Oh = bpropFactor.get_shape()[1]
230
+ Ow = bpropFactor.get_shape()[2]
231
+ if Oh == 1 and Ow == 1 and self._channel_fac:
232
+ # factorization along the channels
233
+ # assume independence between input channels and spatial
234
+ # 2K-1 x 2K-1 covariance matrix and C x C covariance matrix
235
+ # factorization along the channels do not
236
+ # support homogeneous coordinate, assnBias
237
+ # is always None
238
+ fpropFactor2_size = Kh * Kw
239
+ slot_fpropFactor_stats2 = tf.Variable(tf.linalg.tensor_diag(tf.ones(
240
+ [fpropFactor2_size])) * self._diag_init_coeff, name='KFAC_STATS/' + fpropFactor.op.name, trainable=False)
241
+ self.stats[var]['fprop_concat_stats'].append(
242
+ slot_fpropFactor_stats2)
243
+
244
+ fpropFactor_size = C
245
+ else:
246
+ # 2K-1 x 2K-1 x C x C covariance matrix
247
+ # assume BHWC
248
+ fpropFactor_size = Kh * Kw * C
249
+ else:
250
+ # D x D covariance matrix
251
+ fpropFactor_size = fpropFactor.get_shape()[-1]
252
+
253
+ # use homogeneous coordinate
254
+ if not self._blockdiag_bias and self.stats[var]['assnBias']:
255
+ fpropFactor_size += 1
256
+
257
+ slot_fpropFactor_stats = tf.Variable(tf.linalg.tensor_diag(tf.ones(
258
+ [fpropFactor_size])) * self._diag_init_coeff, name='KFAC_STATS/' + fpropFactor.op.name, trainable=False)
259
+ self.stats[var]['fprop_concat_stats'].append(
260
+ slot_fpropFactor_stats)
261
+ if opType != 'Conv2D':
262
+ tmpStatsCache[fpropFactor] = self.stats[
263
+ var]['fprop_concat_stats']
264
+ else:
265
+ self.stats[var][
266
+ 'fprop_concat_stats'] = tmpStatsCache[fpropFactor]
267
+
268
+ if bpropFactor is not None:
269
+ # no need to collect backward stats for bias vectors if
270
+ # using homogeneous coordinates
271
+ if not((not self._blockdiag_bias) and self.stats[var]['assnWeights']):
272
+ if bpropFactor not in tmpStatsCache:
273
+ slot_bpropFactor_stats = tf.Variable(tf.linalg.tensor_diag(tf.ones([bpropFactor.get_shape(
274
+ )[-1]])) * self._diag_init_coeff, name='KFAC_STATS/' + bpropFactor.op.name, trainable=False)
275
+ self.stats[var]['bprop_concat_stats'].append(
276
+ slot_bpropFactor_stats)
277
+ tmpStatsCache[bpropFactor] = self.stats[
278
+ var]['bprop_concat_stats']
279
+ else:
280
+ self.stats[var][
281
+ 'bprop_concat_stats'] = tmpStatsCache[bpropFactor]
282
+
283
+ return self.stats
284
+
285
+ def compute_and_apply_stats(self, loss_sampled, var_list=None):
286
+ varlist = var_list
287
+ if varlist is None:
288
+ varlist = tf.compat.v1.trainable_variables()
289
+
290
+ stats = self.compute_stats(loss_sampled, var_list=varlist)
291
+ return self.apply_stats(stats)
292
+
293
    def compute_stats(self, loss_sampled, var_list=None):
        """Build ops computing the Kronecker-factor covariance statistics.

        For each variable, the forward (activation) and backward (gradient)
        second-moment matrices are computed from the gradients of the
        sampled loss. Returns a dict mapping each stats Variable to the
        tensor holding its freshly computed covariance.
        """
        varlist = var_list
        if varlist is None:
            varlist = tf.compat.v1.trainable_variables()

        # gradients of the sampled loss drive the backward (bprop) factors
        gs = tf.gradients(ys=loss_sampled, xs=varlist, name='gradientsSampled')
        self.gs = gs
        factors = self.getFactors(gs, varlist)
        stats = self.getStats(factors, varlist)

        updateOps = []
        statsUpdates = {}
        # avoids rebuilding the same cov op for stats shared across layers
        statsUpdates_cache = {}
        for var in varlist:
            opType = factors[var]['opName']
            fops = factors[var]['op']
            fpropFactor = factors[var]['fpropFactors_concat']
            fpropStats_vars = stats[var]['fprop_concat_stats']
            bpropFactor = factors[var]['bpropFactors_concat']
            bpropStats_vars = stats[var]['bprop_concat_stats']
            SVD_factors = {}
            for stats_var in fpropStats_vars:
                stats_var_dim = int(stats_var.get_shape()[0])
                if stats_var not in statsUpdates_cache:
                    old_fpropFactor = fpropFactor
                    B = (tf.shape(input=fpropFactor)[0])  # batch size
                    if opType == 'Conv2D':
                        strides = fops.get_attr("strides")
                        padding = fops.get_attr("padding")
                        convkernel_size = var.get_shape()[0:3]

                        KH = int(convkernel_size[0])
                        KW = int(convkernel_size[1])
                        C = int(convkernel_size[2])
                        flatten_size = int(KH * KW * C)

                        # spatial size of the conv output (assumes BHWC)
                        Oh = int(bpropFactor.get_shape()[1])
                        Ow = int(bpropFactor.get_shape()[2])

                        if Oh == 1 and Ow == 1 and self._channel_fac:
                            # factorization along the channels
                            # assume independence among input channels
                            # factor = B x 1 x 1 x (KH xKW x C)
                            # patches = B x Oh x Ow x (KH xKW x C)
                            if len(SVD_factors) == 0:
                                if KFAC_DEBUG:
                                    print(('approx %s act factor with rank-1 SVD factors' % (var.name)))
                                # find closest rank-1 approx to the feature map
                                S, U, V = tf.batch_svd(tf.reshape(
                                    fpropFactor, [-1, KH * KW, C]))
                                # get rank-1 approx slides
                                sqrtS1 = tf.expand_dims(tf.sqrt(S[:, 0, 0]), 1)
                                patches_k = U[:, :, 0] * sqrtS1  # B x KH*KW
                                full_factor_shape = fpropFactor.get_shape()
                                patches_k.set_shape(
                                    [full_factor_shape[0], KH * KW])
                                patches_c = V[:, :, 0] * sqrtS1  # B x C
                                patches_c.set_shape([full_factor_shape[0], C])
                                SVD_factors[C] = patches_c
                                SVD_factors[KH * KW] = patches_k
                            # pick the SVD factor matching this stats var's dim
                            fpropFactor = SVD_factors[stats_var_dim]
                        else:
                            # poor mem usage implementation
                            patches = tf.image.extract_patches(fpropFactor, sizes=[1, convkernel_size[
                                0], convkernel_size[1], 1], strides=strides, rates=[1, 1, 1, 1], padding=padding)

                            if self._approxT2:
                                if KFAC_DEBUG:
                                    print(('approxT2 act fisher for %s' % (var.name)))
                                # T^2 terms * 1/T^2, size: B x C
                                fpropFactor = tf.reduce_mean(input_tensor=patches, axis=[1, 2])
                            else:
                                # size: (B x Oh x Ow) x C
                                fpropFactor = tf.reshape(
                                    patches, [-1, flatten_size]) / Oh / Ow
                    fpropFactor_size = int(fpropFactor.get_shape()[-1])
                    if stats_var_dim == (fpropFactor_size + 1) and not self._blockdiag_bias:
                        if opType == 'Conv2D' and not self._approxT2:
                            # correct padding for numerical stability (we
                            # divided out OhxOw from activations for T1 approx)
                            fpropFactor = tf.concat([fpropFactor, tf.ones(
                                [tf.shape(input=fpropFactor)[0], 1]) / Oh / Ow], 1)
                        else:
                            # use homogeneous coordinates
                            fpropFactor = tf.concat(
                                [fpropFactor, tf.ones([tf.shape(input=fpropFactor)[0], 1])], 1)

                    # average over the number of data points in a batch
                    # divided by B
                    cov = tf.matmul(fpropFactor, fpropFactor,
                                    transpose_a=True) / tf.cast(B, tf.float32)
                    updateOps.append(cov)
                    statsUpdates[stats_var] = cov
                    if opType != 'Conv2D':
                        # HACK: for convolution we recompute fprop stats for
                        # every layer including forking layers
                        statsUpdates_cache[stats_var] = cov

            for stats_var in bpropStats_vars:
                stats_var_dim = int(stats_var.get_shape()[0])
                if stats_var not in statsUpdates_cache:
                    old_bpropFactor = bpropFactor
                    bpropFactor_shape = bpropFactor.get_shape()
                    B = tf.shape(input=bpropFactor)[0]  # batch size
                    C = int(bpropFactor_shape[-1])  # num channels
                    if opType == 'Conv2D' or len(bpropFactor_shape) == 4:
                        if fpropFactor is not None:
                            if self._approxT2:
                                if KFAC_DEBUG:
                                    print(('approxT2 grad fisher for %s' % (var.name)))
                                bpropFactor = tf.reduce_sum(
                                    input_tensor=bpropFactor, axis=[1, 2])  # T^2 terms * 1/T^2
                            else:
                                bpropFactor = tf.reshape(
                                    bpropFactor, [-1, C]) * Oh * Ow  # T * 1/T terms
                        else:
                            # just doing block diag approx. spatial independent
                            # structure does not apply here. summing over
                            # spatial locations
                            if KFAC_DEBUG:
                                print(('block diag approx fisher for %s' % (var.name)))
                            bpropFactor = tf.reduce_sum(input_tensor=bpropFactor, axis=[1, 2])

                    # assume sampled loss is averaged. TO-DO:figure out better
                    # way to handle this
                    bpropFactor *= tf.cast(B, dtype=tf.float32)
                    ##

                    cov_b = tf.matmul(
                        bpropFactor, bpropFactor, transpose_a=True) / tf.cast(tf.shape(input=bpropFactor)[0], dtype=tf.float32)

                    updateOps.append(cov_b)
                    statsUpdates[stats_var] = cov_b
                    statsUpdates_cache[stats_var] = cov_b

        if KFAC_DEBUG:
            aKey = list(statsUpdates.keys())[0]
            statsUpdates[aKey] = tf.compat.v1.Print(statsUpdates[aKey],
                                                    [tf.convert_to_tensor(value='step:'),
                                                     self.global_step,
                                                     tf.convert_to_tensor(
                                                         value='computing stats'),
                                                     ])
        self.statsUpdates = statsUpdates
        return statsUpdates
439
+
440
    def apply_stats(self, statsUpdates):
        """Build the op that folds `statsUpdates` into the running-average
        stats Variables.

        During the warm-up phase the updates are accumulated (superbatch
        average); afterwards an exponential running average is used. With
        `self._async_stats` the update runs on a QueueRunner thread instead.
        Caches the resulting op in `self._update_stats_op`.
        """

        def updateAccumStats():
            # warm-up path: accumulate 1/stats_accum_iter of each update
            if self._full_stats_init:
                return tf.cond(pred=tf.greater(self.sgd_step, self._cold_iter), true_fn=lambda: tf.group(*self._apply_stats(statsUpdates, accumulate=True, accumulateCoeff=1. / self._stats_accum_iter)), false_fn=tf.no_op)
            else:
                return tf.group(*self._apply_stats(statsUpdates, accumulate=True, accumulateCoeff=1. / self._stats_accum_iter))

        def updateRunningAvgStats(statsUpdates, fac_iter=1):
            # return tf.cond(tf.greater_equal(self.factor_step,
            # tf.convert_to_tensor(fac_iter)), lambda:
            # tf.group(*self._apply_stats(stats_list, varlist)), tf.no_op)
            return tf.group(*self._apply_stats(statsUpdates))

        if self._async_stats:
            # asynchronous stats update
            update_stats = self._apply_stats(statsUpdates)

            queue = tf.queue.FIFOQueue(1, [item.dtype for item in update_stats], shapes=[
                item.get_shape() for item in update_stats])
            enqueue_op = queue.enqueue(update_stats)

            def dequeue_stats_op():
                return queue.dequeue()
            self.qr_stats = tf.compat.v1.train.QueueRunner(queue, [enqueue_op])
            # only consume from the queue when a fresh update is available
            update_stats_op = tf.cond(pred=tf.equal(queue.size(), tf.convert_to_tensor(
                value=0)), true_fn=tf.no_op, false_fn=lambda: tf.group(*[dequeue_stats_op(), ]))
        else:
            # synchronous stats update
            update_stats_op = tf.cond(pred=tf.greater_equal(
                self.stats_step, self._stats_accum_iter), true_fn=lambda: updateRunningAvgStats(statsUpdates), false_fn=updateAccumStats)
        self._update_stats_op = update_stats_op
        return update_stats_op
475
+
476
    def _apply_stats(self, statsUpdates, accumulate=False, accumulateCoeff=0.):
        """Emit the assign ops that merge `statsUpdates` into the stats
        Variables, then bump `self.stats_step`.

        With `accumulate=True` each update is added scaled by
        `accumulateCoeff`; otherwise an exponential moving average with
        decay `self._stats_decay` is applied. Returns a one-element list
        containing the stats-step increment op (which depends on all the
        assigns).
        """
        updateOps = []
        # obtain the stats var list
        for stats_var in statsUpdates:
            stats_new = statsUpdates[stats_var]
            if accumulate:
                # simple superbatch averaging
                update_op = tf.compat.v1.assign_add(
                    stats_var, accumulateCoeff * stats_new, use_locking=True)
            else:
                # exponential running averaging
                update_op = tf.compat.v1.assign(
                    stats_var, stats_var * self._stats_decay, use_locking=True)
                # chained on the decayed value returned by the assign above
                update_op = tf.compat.v1.assign_add(
                    update_op, (1. - self._stats_decay) * stats_new, use_locking=True)
            updateOps.append(update_op)

        with tf.control_dependencies(updateOps):
            stats_step_op = tf.compat.v1.assign_add(self.stats_step, 1)

        if KFAC_DEBUG:
            stats_step_op = (tf.compat.v1.Print(stats_step_op,
                                                [tf.convert_to_tensor(value='step:'),
                                                 self.global_step,
                                                 tf.convert_to_tensor(value='fac step:'),
                                                 self.factor_step,
                                                 tf.convert_to_tensor(value='sgd step:'),
                                                 self.sgd_step,
                                                 tf.convert_to_tensor(value='Accum:'),
                                                 tf.convert_to_tensor(value=accumulate),
                                                 tf.convert_to_tensor(value='Accum coeff:'),
                                                 tf.convert_to_tensor(value=accumulateCoeff),
                                                 tf.convert_to_tensor(value='stat step:'),
                                                 self.stats_step, updateOps[0], updateOps[1]]))
        return [stats_step_op, ]
511
+
512
    def getStatsEigen(self, stats=None):
        """Lazily create the Variables that hold each stats matrix's eigen
        decomposition (`e` = eigenvalues init to ones, `Q` = eigenvectors
        init to identity), on CPU.

        Shared stats matrices share one eigen pair via a temporary cache.
        Returns the dict mapping stats Variable -> {'e': ..., 'Q': ...}.
        """
        if len(self.stats_eigen) == 0:
            stats_eigen = {}
            if stats is None:
                stats = self.stats

            tmpEigenCache = {}
            # eigen decomposition is done on CPU; keep its buffers there too
            with tf.device('/cpu:0'):
                for var in stats:
                    for key in ['fprop_concat_stats', 'bprop_concat_stats']:
                        for stats_var in stats[var][key]:
                            if stats_var not in tmpEigenCache:
                                stats_dim = stats_var.get_shape()[1].value
                                e = tf.Variable(tf.ones(
                                    [stats_dim]), name='KFAC_FAC/' + stats_var.name.split(':')[0] + '/e', trainable=False)
                                Q = tf.Variable(tf.linalg.tensor_diag(tf.ones(
                                    [stats_dim])), name='KFAC_FAC/' + stats_var.name.split(':')[0] + '/Q', trainable=False)
                                stats_eigen[stats_var] = {'e': e, 'Q': Q}
                                tmpEigenCache[
                                    stats_var] = stats_eigen[stats_var]
                            else:
                                stats_eigen[stats_var] = tmpEigenCache[
                                    stats_var]
            self.stats_eigen = stats_eigen
        return self.stats_eigen
537
+
538
    def computeStatsEigen(self):
        """ compute the eigen decomp using copied var stats to avoid concurrent read/write from other queue """
        # TO-DO: figure out why this op has delays (possibly moving
        # eigenvectors around?)
        with tf.device('/cpu:0'):
            def removeNone(tensor_list):
                # drop None entries; kept for the (commented-out) copied-stats path
                local_list = []
                for item in tensor_list:
                    if item is not None:
                        local_list.append(item)
                return local_list

            def copyStats(var_list):
                # snapshot stats (optionally in float64) so eigh reads a
                # consistent copy; currently unused (see commented line below)
                print("copying stats to buffer tensors before eigen decomp")
                redundant_stats = {}
                copied_list = []
                for item in var_list:
                    if item is not None:
                        if item not in redundant_stats:
                            if self._use_float64:
                                redundant_stats[item] = tf.cast(
                                    tf.identity(item), tf.float64)
                            else:
                                redundant_stats[item] = tf.identity(item)
                        copied_list.append(redundant_stats[item])
                    else:
                        copied_list.append(None)
                return copied_list
            #stats = [copyStats(self.fStats), copyStats(self.bStats)]
            #stats = [self.fStats, self.bStats]

            stats_eigen = self.stats_eigen
            computedEigen = {}
            eigen_reverse_lookup = {}
            updateOps = []
            # sync copied stats
            # with tf.control_dependencies(removeNone(stats[0]) +
            # removeNone(stats[1])):
            with tf.control_dependencies([]):
                for stats_var in stats_eigen:
                    if stats_var not in computedEigen:
                        # eigh returns (eigenvalues, eigenvectors)
                        eigens = tf.linalg.eigh(stats_var)
                        e = eigens[0]
                        Q = eigens[1]
                        if self._use_float64:
                            e = tf.cast(e, tf.float32)
                            Q = tf.cast(Q, tf.float32)
                        updateOps.append(e)
                        updateOps.append(Q)
                        computedEigen[stats_var] = {'e': e, 'Q': Q}
                        # maps computed tensors back to their storage Variables
                        eigen_reverse_lookup[e] = stats_eigen[stats_var]['e']
                        eigen_reverse_lookup[Q] = stats_eigen[stats_var]['Q']

                self.eigen_reverse_lookup = eigen_reverse_lookup
                self.eigen_update_list = updateOps

                if KFAC_DEBUG:
                    self.eigen_update_list = [item for item in updateOps]
                    with tf.control_dependencies(updateOps):
                        updateOps.append(tf.compat.v1.Print(tf.constant(
                            0.), [tf.convert_to_tensor(value='computed factor eigen')]))

        return updateOps
601
+
602
+ def applyStatsEigen(self, eigen_list):
603
+ updateOps = []
604
+ print(('updating %d eigenvalue/vectors' % len(eigen_list)))
605
+ for i, (tensor, mark) in enumerate(zip(eigen_list, self.eigen_update_list)):
606
+ stats_eigen_var = self.eigen_reverse_lookup[mark]
607
+ updateOps.append(
608
+ tf.compat.v1.assign(stats_eigen_var, tensor, use_locking=True))
609
+
610
+ with tf.control_dependencies(updateOps):
611
+ factor_step_op = tf.compat.v1.assign_add(self.factor_step, 1)
612
+ updateOps.append(factor_step_op)
613
+ if KFAC_DEBUG:
614
+ updateOps.append(tf.compat.v1.Print(tf.constant(
615
+ 0.), [tf.convert_to_tensor(value='updated kfac factors')]))
616
+ return updateOps
617
+
618
    def getKfacPrecondUpdates(self, gradlist, varlist):
        """Precondition `gradlist` with the KFAC approximate Fisher.

        Each gradient is reshaped to 2-D (or a 3-D tensor under channel
        factorization), projected into the Kronecker eigenbasis, whitened by
        the damped eigenvalues, projected back, and finally rescaled by a
        trust-region (vFv) clipping factor. Requires stats, stats_eigen and
        factors to be populated. Returns the preconditioned gradients in the
        same order as `varlist`.
        """
        updatelist = []
        vg = 0.

        assert len(self.stats) > 0
        assert len(self.stats_eigen) > 0
        assert len(self.factors) > 0
        counter = 0

        grad_dict = {var: grad for grad, var in zip(gradlist, varlist)}

        for grad, var in zip(gradlist, varlist):
            GRAD_RESHAPE = False
            GRAD_TRANSPOSE = False

            fpropFactoredFishers = self.stats[var]['fprop_concat_stats']
            bpropFactoredFishers = self.stats[var]['bprop_concat_stats']

            if (len(fpropFactoredFishers) + len(bpropFactoredFishers)) > 0:
                counter += 1
                GRAD_SHAPE = grad.get_shape()
                if len(grad.get_shape()) > 2:
                    # reshape conv kernel parameters
                    KW = int(grad.get_shape()[0])
                    KH = int(grad.get_shape()[1])
                    C = int(grad.get_shape()[2])
                    D = int(grad.get_shape()[3])

                    if len(fpropFactoredFishers) > 1 and self._channel_fac:
                        # reshape conv kernel parameters into tensor
                        grad = tf.reshape(grad, [KW * KH, C, D])
                    else:
                        # reshape conv kernel parameters into 2D grad
                        grad = tf.reshape(grad, [-1, D])
                    GRAD_RESHAPE = True
                elif len(grad.get_shape()) == 1:
                    # reshape bias or 1D parameters
                    D = int(grad.get_shape()[0])

                    grad = tf.expand_dims(grad, 0)
                    GRAD_RESHAPE = True
                else:
                    # 2D parameters
                    C = int(grad.get_shape()[0])
                    D = int(grad.get_shape()[1])

                if (self.stats[var]['assnBias'] is not None) and not self._blockdiag_bias:
                    # use homogeneous coordinates only works for 2D grad.
                    # TO-DO: figure out how to factorize bias grad
                    # stack bias grad
                    var_assnBias = self.stats[var]['assnBias']
                    grad = tf.concat(
                        [grad, tf.expand_dims(grad_dict[var_assnBias], 0)], 0)

                # project gradient to eigen space and reshape the eigenvalues
                # for broadcasting
                eigVals = []

                for idx, stats in enumerate(self.stats[var]['fprop_concat_stats']):
                    Q = self.stats_eigen[stats]['Q']
                    e = detectMinVal(self.stats_eigen[stats][
                                     'e'], var, name='act', debug=KFAC_DEBUG)

                    Q, e = factorReshape(Q, e, grad, facIndx=idx, ftype='act')
                    eigVals.append(e)
                    grad = gmatmul(Q, grad, transpose_a=True, reduce_dim=idx)

                for idx, stats in enumerate(self.stats[var]['bprop_concat_stats']):
                    Q = self.stats_eigen[stats]['Q']
                    e = detectMinVal(self.stats_eigen[stats][
                                     'e'], var, name='grad', debug=KFAC_DEBUG)

                    Q, e = factorReshape(Q, e, grad, facIndx=idx, ftype='grad')
                    eigVals.append(e)
                    grad = gmatmul(grad, Q, transpose_b=False, reduce_dim=idx)
                ##

                #####
                # whiten using eigenvalues
                weightDecayCoeff = 0.
                if var in self._weight_decay_dict:
                    weightDecayCoeff = self._weight_decay_dict[var]
                    if KFAC_DEBUG:
                        print(('weight decay coeff for %s is %f' % (var.name, weightDecayCoeff)))

                if self._factored_damping:
                    if KFAC_DEBUG:
                        print(('use factored damping for %s' % (var.name)))
                    coeffs = 1.
                    num_factors = len(eigVals)
                    # compute the ratio of two trace norm of the left and right
                    # KFac matrices, and their generalization
                    if len(eigVals) == 1:
                        damping = self._epsilon + weightDecayCoeff
                    else:
                        damping = tf.pow(
                            self._epsilon + weightDecayCoeff, 1. / num_factors)
                    eigVals_tnorm_avg = [tf.reduce_mean(
                        input_tensor=tf.abs(e)) for e in eigVals]
                    for e, e_tnorm in zip(eigVals, eigVals_tnorm_avg):
                        eig_tnorm_negList = [
                            item for item in eigVals_tnorm_avg if item != e_tnorm]
                        if len(eigVals) == 1:
                            adjustment = 1.
                        elif len(eigVals) == 2:
                            adjustment = tf.sqrt(
                                e_tnorm / eig_tnorm_negList[0])
                        else:
                            eig_tnorm_negList_prod = reduce(
                                lambda x, y: x * y, eig_tnorm_negList)
                            adjustment = tf.pow(
                                tf.pow(e_tnorm, num_factors - 1.) / eig_tnorm_negList_prod, 1. / num_factors)
                        coeffs *= (e + adjustment * damping)
                else:
                    # plain (non-factored) damping: damp the eigenvalue product
                    coeffs = 1.
                    damping = (self._epsilon + weightDecayCoeff)
                    for e in eigVals:
                        coeffs *= e
                    coeffs += damping

                #grad = tf.Print(grad, [tf.convert_to_tensor('1'), tf.convert_to_tensor(var.name), grad.get_shape()])

                grad /= coeffs

                #grad = tf.Print(grad, [tf.convert_to_tensor('2'), tf.convert_to_tensor(var.name), grad.get_shape()])
                #####
                # project gradient back to euclidean space
                for idx, stats in enumerate(self.stats[var]['fprop_concat_stats']):
                    Q = self.stats_eigen[stats]['Q']
                    grad = gmatmul(Q, grad, transpose_a=False, reduce_dim=idx)

                for idx, stats in enumerate(self.stats[var]['bprop_concat_stats']):
                    Q = self.stats_eigen[stats]['Q']
                    grad = gmatmul(grad, Q, transpose_b=True, reduce_dim=idx)
                ##

                #grad = tf.Print(grad, [tf.convert_to_tensor('3'), tf.convert_to_tensor(var.name), grad.get_shape()])
                if (self.stats[var]['assnBias'] is not None) and not self._blockdiag_bias:
                    # use homogeneous coordinates only works for 2D grad.
                    # TO-DO: figure out how to factorize bias grad
                    # un-stack bias grad
                    var_assnBias = self.stats[var]['assnBias']
                    C_plus_one = int(grad.get_shape()[0])
                    grad_assnBias = tf.reshape(tf.slice(grad,
                                                        begin=[
                                                            C_plus_one - 1, 0],
                                                        size=[1, -1]), var_assnBias.get_shape())
                    grad_assnWeights = tf.slice(grad,
                                                begin=[0, 0],
                                                size=[C_plus_one - 1, -1])
                    grad_dict[var_assnBias] = grad_assnBias
                    grad = grad_assnWeights

                #grad = tf.Print(grad, [tf.convert_to_tensor('4'), tf.convert_to_tensor(var.name), grad.get_shape()])
                if GRAD_RESHAPE:
                    grad = tf.reshape(grad, GRAD_SHAPE)

                grad_dict[var] = grad

        print(('projecting %d gradient matrices' % counter))

        for g, var in zip(gradlist, varlist):
            grad = grad_dict[var]
            ### clipping ###
            if KFAC_DEBUG:
                print(('apply clipping to %s' % (var.name)))
                tf.compat.v1.Print(grad, [tf.sqrt(tf.reduce_sum(input_tensor=tf.pow(grad, 2)))], "Euclidean norm of new grad")
            # vFv accumulates the (lr-scaled) inner product of raw and
            # preconditioned gradients for trust-region scaling below
            local_vg = tf.reduce_sum(input_tensor=grad * g * (self._lr * self._lr))
            vg += local_vg

        # recale everything
        if KFAC_DEBUG:
            print('apply vFv clipping')

        scaling = tf.minimum(1., tf.sqrt(self._clip_kl / vg))
        if KFAC_DEBUG:
            scaling = tf.compat.v1.Print(scaling, [tf.convert_to_tensor(
                value='clip: '), scaling, tf.convert_to_tensor(value=' vFv: '), vg])
        with tf.control_dependencies([tf.compat.v1.assign(self.vFv, vg)]):
            updatelist = [grad_dict[var] for var in varlist]
            for i, item in enumerate(updatelist):
                updatelist[i] = scaling * item

        return updatelist
802
+
803
+ def compute_gradients(self, loss, var_list=None):
804
+ varlist = var_list
805
+ if varlist is None:
806
+ varlist = tf.compat.v1.trainable_variables()
807
+ g = tf.gradients(ys=loss, xs=varlist)
808
+
809
+ return [(a, b) for a, b in zip(g, varlist)]
810
+
811
    def apply_gradients_kfac(self, grads):
        """Build the KFAC training op for (grad, var) pairs `grads`.

        Wires together: stats update -> (sync or async) eigen decomposition
        -> preconditioned gradient computation -> Momentum optimizer step,
        each gated by step counters. Returns (train_op, queue_runner) where
        the queue runner is None unless async eigen decomposition is on.
        Requires compute_and_apply_stats() to have run (sets
        self._update_stats_op).
        """
        g, varlist = list(zip(*grads))

        if len(self.stats_eigen) == 0:
            self.getStatsEigen()

        qr = None
        # launch eigen-decomp on a queue thread
        if self._async:
            print('Use async eigen decomp')
            # get a list of factor loading tensors
            factorOps_dummy = self.computeStatsEigen()

            # define a queue for the list of factor loading tensors
            queue = tf.queue.FIFOQueue(1, [item.dtype for item in factorOps_dummy], shapes=[
                item.get_shape() for item in factorOps_dummy])
            # enqueue only every _kfac_update steps, after stats accumulation
            enqueue_op = tf.cond(pred=tf.logical_and(tf.equal(tf.math.floormod(self.stats_step, self._kfac_update), tf.convert_to_tensor(
                value=0)), tf.greater_equal(self.stats_step, self._stats_accum_iter)), true_fn=lambda: queue.enqueue(self.computeStatsEigen()), false_fn=tf.no_op)

            def dequeue_op():
                return queue.dequeue()

            qr = tf.compat.v1.train.QueueRunner(queue, [enqueue_op])

        updateOps = []
        global_step_op = tf.compat.v1.assign_add(self.global_step, 1)
        updateOps.append(global_step_op)

        with tf.control_dependencies([global_step_op]):

            # compute updates
            assert self._update_stats_op != None
            updateOps.append(self._update_stats_op)
            dependency_list = []
            if not self._async:
                dependency_list.append(self._update_stats_op)

            with tf.control_dependencies(dependency_list):
                def no_op_wrapper():
                    return tf.group(*[tf.compat.v1.assign_add(self.cold_step, 1)])

                if not self._async:
                    # synchronous eigen-decomp updates
                    updateFactorOps = tf.cond(pred=tf.logical_and(tf.equal(tf.math.floormod(self.stats_step, self._kfac_update),
                                                                           tf.convert_to_tensor(value=0)),
                                                                  tf.greater_equal(self.stats_step, self._stats_accum_iter)), true_fn=lambda: tf.group(*self.applyStatsEigen(self.computeStatsEigen())), false_fn=no_op_wrapper)
                else:
                    # asynchronous eigen-decomp updates using queue
                    updateFactorOps = tf.cond(pred=tf.greater_equal(self.stats_step, self._stats_accum_iter),
                                              true_fn=lambda: tf.cond(pred=tf.equal(queue.size(), tf.convert_to_tensor(value=0)),
                                                                      true_fn=tf.no_op,

                                                                      false_fn=lambda: tf.group(
                                                                          *self.applyStatsEigen(dequeue_op())),
                                                                      ),
                                              false_fn=no_op_wrapper)

                updateOps.append(updateFactorOps)

                with tf.control_dependencies([updateFactorOps]):
                    def gradOp():
                        # before the first factor update: raw gradients
                        return list(g)

                    def getKfacGradOp():
                        return self.getKfacPrecondUpdates(g, varlist)
                    u = tf.cond(pred=tf.greater(self.factor_step,
                                                tf.convert_to_tensor(value=0)), true_fn=getKfacGradOp, false_fn=gradOp)

                    # lr is pre-scaled by (1 - momentum) so the effective
                    # step size matches the momentum accumulation
                    optim = tf.compat.v1.train.MomentumOptimizer(
                        self._lr * (1. - self._momentum), self._momentum)
                    #optim = tf.train.AdamOptimizer(self._lr, epsilon=0.01)

                    def optimOp():
                        def updateOptimOp():
                            if self._full_stats_init:
                                return tf.cond(pred=tf.greater(self.factor_step, tf.convert_to_tensor(value=0)), true_fn=lambda: optim.apply_gradients(list(zip(u, varlist))), false_fn=tf.no_op)
                            else:
                                return optim.apply_gradients(list(zip(u, varlist)))
                        if self._full_stats_init:
                            return tf.cond(pred=tf.greater_equal(self.stats_step, self._stats_accum_iter), true_fn=updateOptimOp, false_fn=tf.no_op)
                        else:
                            return tf.cond(pred=tf.greater_equal(self.sgd_step, self._cold_iter), true_fn=updateOptimOp, false_fn=tf.no_op)
                    updateOps.append(optimOp())

        return tf.group(*updateOps), qr
896
+
897
    def apply_gradients(self, grads):
        """Top-level training op: cold SGD (momentum, optionally gradient-
        clipped) for the first `_cold_iter` steps, then the KFAC update.

        Returns (train_op, queue_runner); the queue runner comes from
        apply_gradients_kfac and is None unless async mode is on.
        """
        coldOptim = tf.compat.v1.train.MomentumOptimizer(
            self._cold_lr, self._momentum)

        def coldSGDstart():
            sgd_grads, sgd_var = zip(*grads)

            if self.max_grad_norm != None:
                sgd_grads, sgd_grad_norm = tf.clip_by_global_norm(sgd_grads,self.max_grad_norm)

            sgd_grads = list(zip(sgd_grads,sgd_var))

            sgd_step_op = tf.compat.v1.assign_add(self.sgd_step, 1)
            coldOptim_op = coldOptim.apply_gradients(sgd_grads)
            if KFAC_DEBUG:
                with tf.control_dependencies([sgd_step_op, coldOptim_op]):
                    sgd_step_op = tf.compat.v1.Print(
                        sgd_step_op, [self.sgd_step, tf.convert_to_tensor(value='doing cold sgd step')])
            return tf.group(*[sgd_step_op, coldOptim_op])

        kfacOptim_op, qr = self.apply_gradients_kfac(grads)

        def warmKFACstart():
            return kfacOptim_op

        # switch from cold SGD to KFAC once sgd_step exceeds the cold period
        return tf.cond(pred=tf.greater(self.sgd_step, self._cold_iter), true_fn=warmKFACstart, false_fn=coldSGDstart), qr
923
+
924
+ def minimize(self, loss, loss_sampled, var_list=None):
925
+ grads = self.compute_gradients(loss, var_list=var_list)
926
+ update_stats_op = self.compute_and_apply_stats(
927
+ loss_sampled, var_list=var_list)
928
+ return self.apply_gradients(grads)
baselines/acktr/kfac_utils.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tensorflow as tf
2
+
3
def gmatmul(a, b, transpose_a=False, transpose_b=False, reduce_dim=None):
    """Generalized matmul contracting a 2-D matrix with an N-D tensor.

    Three supported cases: (2-D a, N-D b) contracts along b's axis
    `reduce_dim`; (N-D a, 2-D b) contracts along a's axis counted
    `reduce_dim` from the last; (2-D, 2-D) is a plain matmul. Any other
    rank combination is an error. `reduce_dim` is mandatory.
    """
    assert reduce_dim is not None

    # weird batch matmul
    if len(a.get_shape()) == 2 and len(b.get_shape()) > 2:
        # reshape reduce_dim to the left most dim in b
        b_shape = b.get_shape()
        if reduce_dim != 0:
            b_dims = list(range(len(b_shape)))
            b_dims.remove(reduce_dim)
            b_dims.insert(0, reduce_dim)
            b = tf.transpose(a=b, perm=b_dims)
        b_t_shape = b.get_shape()
        # flatten all remaining axes so an ordinary matmul applies
        b = tf.reshape(b, [int(b_shape[reduce_dim]), -1])
        result = tf.matmul(a, b, transpose_a=transpose_a,
                           transpose_b=transpose_b)
        result = tf.reshape(result, b_t_shape)
        if reduce_dim != 0:
            # undo the permutation applied above
            b_dims = list(range(len(b_shape)))
            b_dims.remove(0)
            b_dims.insert(reduce_dim, 0)
            result = tf.transpose(a=result, perm=b_dims)
        return result

    elif len(a.get_shape()) > 2 and len(b.get_shape()) == 2:
        # reshape reduce_dim to the right most dim in a
        a_shape = a.get_shape()
        outter_dim = len(a_shape) - 1
        # NOTE: reduce_dim is counted from the last axis in this branch
        reduce_dim = len(a_shape) - reduce_dim - 1
        if reduce_dim != outter_dim:
            a_dims = list(range(len(a_shape)))
            a_dims.remove(reduce_dim)
            a_dims.insert(outter_dim, reduce_dim)
            a = tf.transpose(a=a, perm=a_dims)
        a_t_shape = a.get_shape()
        a = tf.reshape(a, [-1, int(a_shape[reduce_dim])])
        result = tf.matmul(a, b, transpose_a=transpose_a,
                           transpose_b=transpose_b)
        result = tf.reshape(result, a_t_shape)
        if reduce_dim != outter_dim:
            # undo the permutation applied above
            a_dims = list(range(len(a_shape)))
            a_dims.remove(outter_dim)
            a_dims.insert(reduce_dim, outter_dim)
            result = tf.transpose(a=result, perm=a_dims)
        return result

    elif len(a.get_shape()) == 2 and len(b.get_shape()) == 2:
        return tf.matmul(a, b, transpose_a=transpose_a, transpose_b=transpose_b)

    assert False, 'something went wrong'
53
+
54
+
55
def clipoutNeg(vec, threshold=1e-6):
    """Zero out every entry of `vec` that is not strictly above `threshold`
    (used to discard negative / near-zero eigenvalues)."""
    keep = tf.cast(vec > threshold, tf.float32)
    return vec * keep
58
+
59
+
60
def detectMinVal(input_mat, var, threshold=1e-6, name='', debug=False):
    """Clip near-zero/negative eigenvalues of `input_mat` to zero via
    clipoutNeg; `var` is only used for its name in the debug printout.
    """
    eigen_min = tf.reduce_min(input_tensor=input_mat)
    eigen_max = tf.reduce_max(input_tensor=input_mat)
    eigen_ratio = eigen_max / eigen_min
    input_mat_clipped = clipoutNeg(input_mat, threshold)

    if debug:
        # NOTE(review): this prints a warning only when eigen_ratio falls in
        # [-500, 0] — i.e. when the min/max eigenvalues have opposite signs
        # within a bound. The guard looks suspicious (greater(ratio, 0.)
        # silences all positive-definite cases); verify intent upstream.
        input_mat_clipped = tf.cond(pred=tf.logical_or(tf.greater(eigen_ratio, 0.), tf.less(eigen_ratio, -500)), true_fn=lambda: input_mat_clipped, false_fn=lambda: tf.compat.v1.Print(
            input_mat_clipped, [tf.convert_to_tensor(value='screwed ratio ' + name + ' eigen values!!!'), tf.convert_to_tensor(value=var.name), eigen_min, eigen_max, eigen_ratio]))

    return input_mat_clipped
71
+
72
+
73
def factorReshape(Q, e, grad, facIndx=0, ftype='act'):
    """Reshape eigenvalue vector `e` so it broadcasts against `grad` along
    the factor's axis; `Q` is returned unchanged.

    'act' factors align with axis `facIndx` counted from the left of
    `grad`; 'grad' factors align with axis `facIndx` counted from the
    right.
    """
    grad_shape = grad.get_shape()
    rank = len(grad_shape)

    if ftype == 'act':
        axis = facIndx
        assert e.get_shape()[0] == grad_shape[axis]
        bcast_shape = [1] * rank
        bcast_shape[axis] = -1
        e = tf.reshape(e, bcast_shape)
    if ftype == 'grad':
        axis = rank - facIndx - 1
        assert e.get_shape()[0] == grad_shape[axis]
        bcast_shape = [1] * rank
        bcast_shape[axis] = -1
        e = tf.reshape(e, bcast_shape)

    return Q, e
baselines/acktr/utils.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tensorflow as tf
2
+
3
def dense(x, size, name, weight_init=None, bias_init=0, weight_loss_dict=None, reuse=None):
    """Fully-connected layer: returns x @ w + b under variable scope `name`.

    Args:
        x: 2-D input tensor of shape [batch, features].
        size: number of output units.
        name: variable scope name for the layer's parameters.
        weight_init: initializer for the weight matrix `w`.
        bias_init: constant initializer value for the bias `b`.
        weight_loss_dict: if given, an L2 weight-decay loss for `w` is added
            to the '<top_scope>_losses' collection and the decay coefficients
            are recorded in the dict (0.0 for the bias).
        reuse: passed through to variable_scope for variable reuse.
    """
    with tf.compat.v1.variable_scope(name, reuse=reuse):
        # this layer is expected to live exactly one scope below the top level
        assert (len(tf.compat.v1.get_variable_scope().name.split('/')) == 2)

        w = tf.compat.v1.get_variable("w", [x.get_shape()[1], size], initializer=weight_init)
        b = tf.compat.v1.get_variable("b", [size], initializer=tf.compat.v1.constant_initializer(bias_init))
        weight_decay_fc = 3e-4

        if weight_loss_dict is not None:
            weight_decay = tf.multiply(tf.nn.l2_loss(w), weight_decay_fc, name='weight_decay_loss')
            # (fix: the original re-checked `weight_loss_dict is not None`
            # here, which was redundant inside this branch)
            weight_loss_dict[w] = weight_decay_fc
            weight_loss_dict[b] = 0.0

            tf.compat.v1.add_to_collection(tf.compat.v1.get_variable_scope().name.split('/')[0] + '_' + 'losses', weight_decay)

        return tf.nn.bias_add(tf.matmul(x, w), b)
20
+
21
def kl_div(action_dist1, action_dist2, action_size):
    """KL divergence between two diagonal Gaussian action distributions.

    Each distribution is packed as [means | stds] along the last axis, with
    `action_size` marking the split point. Returns the KL summed over the
    action dimensions.
    """
    mean1 = action_dist1[:, :action_size]
    std1 = action_dist1[:, action_size:]
    mean2 = action_dist2[:, :action_size]
    std2 = action_dist2[:, action_size:]

    # standard closed form: (|mu1-mu2|^2 + s1^2 - s2^2) / (2 s2^2) + log(s2/s1)
    ratio_term = (tf.square(mean1 - mean2) + tf.square(std1) - tf.square(std2)) \
        / (2 * tf.square(std2) + 1e-8)
    log_term = tf.math.log(std2) - tf.math.log(std1)
    return tf.reduce_sum(input_tensor=ratio_term + log_term, axis=-1)
baselines/bench/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # flake8: noqa F403
2
+ from baselines.bench.benchmarks import *
3
+ from baselines.bench.monitor import *
baselines/bench/benchmarks.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import os
3
# directory containing this module (handy for locating bundled data files)
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

# the 7 Atari games used in Mnih et al. (2013)
_atari7 = ['BeamRider', 'Breakout', 'Enduro', 'Pong', 'Qbert', 'Seaquest', 'SpaceInvaders']
# 7 Atari games emphasizing exploration
_atariexpl7 = ['Freeway', 'Gravitar', 'MontezumaRevenge', 'Pitfall', 'PrivateEye', 'Solaris', 'Venture']

# global registry, populated via register_benchmark()
_BENCHMARKS = []

# strips trailing gym version suffixes such as '-v2' from env ids
remove_version_re = re.compile(r'-v\d+$')
+
13
def register_benchmark(benchmark):
    """Add `benchmark` to the global registry.

    Raises ValueError if a benchmark with the same name already exists.
    Tasks lacking a 'desc' entry get one derived from their env id with the
    gym version suffix stripped.
    """
    new_name = benchmark['name']
    if any(existing['name'] == new_name for existing in _BENCHMARKS):
        raise ValueError('Benchmark with name %s already registered!' % new_name)

    # automatically add a description if it is not present
    if 'tasks' in benchmark:
        for task in benchmark['tasks']:
            task.setdefault('desc', remove_version_re.sub('', task.get('env_id', task.get('id'))))
    _BENCHMARKS.append(benchmark)
24
+
25
+
26
def list_benchmarks():
    """Return the names of all registered benchmarks."""
    return [bench['name'] for bench in _BENCHMARKS]
28
+
29
+
30
def get_benchmark(benchmark_name):
    """Look up a registered benchmark by name; raise ValueError if unknown."""
    match = next((bench for bench in _BENCHMARKS if bench['name'] == benchmark_name), None)
    if match is None:
        raise ValueError('%s not found! Known benchmarks: %s' % (benchmark_name, list_benchmarks()))
    return match
35
+
36
+
37
def get_task(benchmark, env_id):
    """Get a task by env_id. Return None if the benchmark doesn't have the env"""
    for task in benchmark['tasks']:
        if task['env_id'] == env_id:
            return task
    return None
40
+
41
+
42
def find_task_for_env_id_in_any_benchmark(env_id):
    """Search every registered benchmark for a task with this env_id.

    Returns (benchmark, task) for the first match, else (None, None).
    """
    matches = ((bench, task)
               for bench in _BENCHMARKS
               for task in bench["tasks"]
               if task["env_id"] == env_id)
    return next(matches, (None, None))
48
+
49
+
50
_ATARI_SUFFIX = 'NoFrameskip-v4'


def _atari_tasks(games, trials, **budget):
    """One pixel-observation task spec per game, with the given trial count and
    a compute budget passed as num_timesteps=... or num_seconds=...."""
    return [dict(desc=game, env_id=game + _ATARI_SUFFIX, trials=trials, **budget)
            for game in games]


register_benchmark({
    'name': 'Atari50M',
    'description': '7 Atari games from Mnih et al. (2013), with pixel observations, 50M timesteps',
    'tasks': _atari_tasks(_atari7, trials=2, num_timesteps=int(50e6)),
})

register_benchmark({
    'name': 'Atari10M',
    'description': '7 Atari games from Mnih et al. (2013), with pixel observations, 10M timesteps',
    'tasks': _atari_tasks(_atari7, trials=6, num_timesteps=int(10e6)),
})

register_benchmark({
    'name': 'Atari1Hr',
    'description': '7 Atari games from Mnih et al. (2013), with pixel observations, 1 hour of walltime',
    'tasks': _atari_tasks(_atari7, trials=2, num_seconds=60 * 60),
})

register_benchmark({
    'name': 'AtariExploration10M',
    'description': '7 Atari games emphasizing exploration, with pixel observations, 10M timesteps',
    'tasks': _atari_tasks(_atariexpl7, trials=2, num_timesteps=int(10e6)),
})


# MuJoCo

_mujocosmall = [
    'InvertedDoublePendulum-v2', 'InvertedPendulum-v2',
    'HalfCheetah-v2', 'Hopper-v2', 'Walker2d-v2',
    'Reacher-v2', 'Swimmer-v2']

register_benchmark({
    'name': 'Mujoco1M',
    'description': 'Some small 2D MuJoCo tasks, run for 1M timesteps',
    'tasks': [{'env_id': env_name, 'trials': 6, 'num_timesteps': int(1e6)} for env_name in _mujocosmall],
})

register_benchmark({
    'name': 'MujocoWalkers',
    'description': 'MuJoCo forward walkers, run for 8M, humanoid 100M',
    'tasks': [
        {'env_id': "Hopper-v1", 'trials': 4, 'num_timesteps': 8 * 1000000},
        {'env_id': "Walker2d-v1", 'trials': 4, 'num_timesteps': 8 * 1000000},
        {'env_id': "Humanoid-v1", 'trials': 4, 'num_timesteps': 100 * 1000000},
    ],
})

# Bullet
_bulletsmall = [
    name + 'BulletEnv-v0'
    for name in ('InvertedDoublePendulum', 'InvertedPendulum', 'HalfCheetah',
                 'Reacher', 'Walker2D', 'Hopper', 'Ant')
]

register_benchmark({
    'name': 'Bullet1M',
    'description': '6 mujoco-like tasks from bullet, 1M steps',
    'tasks': [{'env_id': env_name, 'trials': 6, 'num_timesteps': int(1e6)} for env_name in _bulletsmall],
})


# Roboschool

register_benchmark({
    'name': 'Roboschool8M',
    'description': 'Small 2D tasks, up to 30 minutes to complete on 8 cores',
    'tasks': [
        {'env_id': "RoboschoolReacher-v1", 'trials': 4, 'num_timesteps': 2 * 1000000},
        {'env_id': "RoboschoolAnt-v1", 'trials': 4, 'num_timesteps': 8 * 1000000},
        {'env_id': "RoboschoolHalfCheetah-v1", 'trials': 4, 'num_timesteps': 8 * 1000000},
        {'env_id': "RoboschoolHopper-v1", 'trials': 4, 'num_timesteps': 8 * 1000000},
        {'env_id': "RoboschoolWalker2d-v1", 'trials': 4, 'num_timesteps': 8 * 1000000},
    ],
})

register_benchmark({
    'name': 'RoboschoolHarder',
    'description': 'Test your might!!! Up to 12 hours on 32 cores',
    'tasks': [
        {'env_id': "RoboschoolHumanoid-v1", 'trials': 4, 'num_timesteps': 100 * 1000000},
        {'env_id': "RoboschoolHumanoidFlagrun-v1", 'trials': 4, 'num_timesteps': 200 * 1000000},
        {'env_id': "RoboschoolHumanoidFlagrunHarder-v1", 'trials': 4, 'num_timesteps': 400 * 1000000},
    ],
})

# Other

_atari50 = [  # actually 47
    'Alien', 'Amidar', 'Assault', 'Asterix', 'Asteroids',
    'Atlantis', 'BankHeist', 'BattleZone', 'BeamRider', 'Bowling',
    'Breakout', 'Centipede', 'ChopperCommand', 'CrazyClimber',
    'DemonAttack', 'DoubleDunk', 'Enduro', 'FishingDerby', 'Freeway',
    'Frostbite', 'Gopher', 'Gravitar', 'IceHockey', 'Jamesbond',
    'Kangaroo', 'Krull', 'KungFuMaster', 'MontezumaRevenge', 'MsPacman',
    'NameThisGame', 'Pitfall', 'Pong', 'PrivateEye', 'Qbert',
    'RoadRunner', 'Robotank', 'Seaquest', 'SpaceInvaders', 'StarGunner',
    'Tennis', 'TimePilot', 'Tutankham', 'UpNDown', 'Venture',
    'VideoPinball', 'WizardOfWor', 'Zaxxon',
]

register_benchmark({
    'name': 'Atari50_10M',
    'description': '47 Atari games from Mnih et al. (2013), with pixel observations, 10M timesteps',
    'tasks': _atari_tasks(_atari50, trials=2, num_timesteps=int(10e6)),
})

# HER DDPG

_fetch_tasks = ['FetchReach-v1', 'FetchPush-v1', 'FetchSlide-v1']

register_benchmark({
    'name': 'Fetch1M',
    'description': 'Fetch* benchmarks for 1M timesteps',
    'tasks': [{'trials': 6, 'env_id': env_name, 'num_timesteps': int(1e6)} for env_name in _fetch_tasks],
})
164
+
baselines/bench/monitor.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __all__ = ['Monitor', 'get_monitor_files', 'load_results']
2
+
3
+ from gym.core import Wrapper
4
+ import time
5
+ from glob import glob
6
+ import csv
7
+ import os.path as osp
8
+ import json
9
+
10
class Monitor(Wrapper):
    """Gym env wrapper that records per-episode reward, length and wall-clock
    time, exposes them via info['episode'], and optionally appends each episode
    to a '*.monitor.csv' log file.

    Parameters
    ----------
    env : gym.Env
        environment to wrap
    filename : str or None
        path (or directory) for the monitor csv; None disables file logging
    allow_early_resets : bool
        if False, calling reset() before the episode is done raises RuntimeError
    reset_keywords : tuple of str
        kwargs that must be passed to reset(); their values are logged per episode
    info_keywords : tuple of str
        keys copied from the final step's info dict into the episode record
    """
    EXT = "monitor.csv"
    # legacy class-level handle; kept so close() remains backward compatible
    f = None

    def __init__(self, env, filename, allow_early_resets=False, reset_keywords=(), info_keywords=()):
        Wrapper.__init__(self, env=env)
        self.tstart = time.time()
        if filename:
            self.results_writer = ResultsWriter(filename,
                header={"t_start": time.time(), 'env_id' : env.spec and env.spec.id},
                extra_keys=reset_keywords + info_keywords
            )
        else:
            self.results_writer = None
        self.reset_keywords = reset_keywords
        self.info_keywords = info_keywords
        self.allow_early_resets = allow_early_resets
        self.rewards = None
        self.needs_reset = True
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_times = []
        self.total_steps = 0
        self.current_reset_info = {} # extra info about the current episode, that was passed in during reset()

    def reset(self, **kwargs):
        """Reset the env; required reset_keywords must be present in kwargs."""
        self.reset_state()
        for k in self.reset_keywords:
            v = kwargs.get(k)
            if v is None:
                raise ValueError('Expected you to pass kwarg %s into reset'%k)
            self.current_reset_info[k] = v
        return self.env.reset(**kwargs)

    def reset_state(self):
        # guard against resetting mid-episode unless explicitly allowed
        if not self.allow_early_resets and not self.needs_reset:
            raise RuntimeError("Tried to reset an environment before done. If you want to allow early resets, wrap your env with Monitor(env, path, allow_early_resets=True)")
        self.rewards = []
        self.needs_reset = False

    def step(self, action):
        if self.needs_reset:
            raise RuntimeError("Tried to step environment that needs reset")
        ob, rew, done, info = self.env.step(action)
        self.update(ob, rew, done, info)
        return (ob, rew, done, info)

    def update(self, ob, rew, done, info):
        """Accumulate the step reward; on episode end, record + log the stats."""
        self.rewards.append(rew)
        if done:
            self.needs_reset = True
            eprew = sum(self.rewards)
            eplen = len(self.rewards)
            epinfo = {"r": round(eprew, 6), "l": eplen, "t": round(time.time() - self.tstart, 6)}
            for k in self.info_keywords:
                epinfo[k] = info[k]
            self.episode_rewards.append(eprew)
            self.episode_lengths.append(eplen)
            self.episode_times.append(time.time() - self.tstart)
            epinfo.update(self.current_reset_info)
            if self.results_writer:
                self.results_writer.write_row(epinfo)
            assert isinstance(info, dict)
            # BUGFIX: removed the dead `if isinstance(info, dict)` branch that
            # immediately followed the identical assertion above.
            info['episode'] = epinfo
        self.total_steps += 1

    def close(self):
        """Close the monitor's log file(s).

        BUGFIX: the file opened by ResultsWriter was previously never closed,
        leaking one file handle per Monitor; close it here as well.
        """
        if self.f is not None:
            self.f.close()
        if self.results_writer is not None:
            self.results_writer.f.close()

    def get_total_steps(self):
        return self.total_steps

    def get_episode_rewards(self):
        return self.episode_rewards

    def get_episode_lengths(self):
        return self.episode_lengths

    def get_episode_times(self):
        return self.episode_times
94
+
95
class LoadMonitorResultsError(Exception):
    """Raised by load_results() when a directory contains no monitor files."""
    pass
97
+
98
+
99
class ResultsWriter(object):
    """Writes episode records to a '*.monitor.csv' file: one '#'-prefixed JSON
    header line, then csv rows with fields r (reward), l (length), t (time)
    plus any extra_keys."""

    def __init__(self, filename, header='', extra_keys=()):
        self.extra_keys = extra_keys
        assert filename is not None
        # accept a directory or a bare prefix; normalize to <...>.monitor.csv
        if not filename.endswith(Monitor.EXT):
            if osp.isdir(filename):
                filename = osp.join(filename, Monitor.EXT)
            else:
                filename = filename + "." + Monitor.EXT
        self.f = open(filename, "wt")
        if isinstance(header, dict):
            header = '# {} \n'.format(json.dumps(header))
        self.f.write(header)
        self.logger = csv.DictWriter(self.f, fieldnames=('r', 'l', 't')+tuple(extra_keys))
        self.logger.writeheader()
        self.f.flush()

    def write_row(self, epinfo):
        """Append one episode record and flush so readers see it immediately."""
        if self.logger:
            self.logger.writerow(epinfo)
            self.f.flush()

    def close(self):
        """Close the underlying file.

        New method: previously there was no way to release the handle, so it
        leaked for the lifetime of the process.
        """
        self.f.close()
120
+
121
+
122
def get_monitor_files(dir):
    """Return the paths of all '*monitor.csv' files directly inside *dir*."""
    return glob(osp.join(dir, "*" + Monitor.EXT))

def load_results(dir):
    """Load every monitor file in *dir* into a single pandas DataFrame.

    Handles both the current csv format (JSON header line starting with '#')
    and the deprecated json-lines format. Rows are sorted by time and the 't'
    column is shifted so the earliest t_start across all files becomes 0.
    Raises LoadMonitorResultsError when the directory has no monitor files.
    """
    import pandas
    monitor_files = (
        glob(osp.join(dir, "*monitor.json")) +
        glob(osp.join(dir, "*monitor.csv"))) # get both csv and (old) json files
    if not monitor_files:
        raise LoadMonitorResultsError("no monitor files of the form *%s found in %s" % (Monitor.EXT, dir))
    dfs = []
    headers = []
    for fname in monitor_files:
        with open(fname, 'rt') as fh:
            if fname.endswith('csv'):
                firstline = fh.readline()
                if not firstline:
                    continue  # empty file: nothing to load
                assert firstline[0] == '#'
                header = json.loads(firstline[1:])
                df = pandas.read_csv(fh, index_col=None)
                headers.append(header)
            elif fname.endswith('json'): # Deprecated json format
                lines = fh.readlines()
                header = json.loads(lines[0])
                headers.append(header)
                df = pandas.DataFrame([json.loads(line) for line in lines[1:]])
            else:
                assert 0, 'unreachable'
            # shift episode times to absolute wall-clock using this file's start
            df['t'] += header['t_start']
        dfs.append(df)
    df = pandas.concat(dfs)
    df.sort_values('t', inplace=True)
    df.reset_index(inplace=True)
    df['t'] -= min(h['t_start'] for h in headers)
    df.headers = headers # HACK to preserve backwards compatibility
    return df
baselines/bench/test_monitor.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .monitor import Monitor
2
+ import gym
3
+ import json
4
+
5
def test_monitor():
    """Smoke-test Monitor: run CartPole episodes, then validate the csv's
    JSON metadata header and the logged column names."""
    import pandas
    import os
    import uuid

    env = gym.make("CartPole-v1")
    env.seed(0)
    mon_file = "/tmp/baselines-test-%s.monitor.csv" % uuid.uuid4()
    menv = Monitor(env, mon_file)
    menv.reset()
    for _ in range(1000):
        _, _, done, _ = menv.step(0)
        if done:
            menv.reset()

    # BUGFIX: use a context manager so the handle is closed even when an
    # assertion below fails (the original leaked the open file in that case).
    with open(mon_file, 'rt') as f:
        firstline = f.readline()
        assert firstline.startswith('#')
        metadata = json.loads(firstline[1:])
        assert metadata['env_id'] == "CartPole-v1"
        assert set(metadata.keys()) == {'env_id', 't_start'}, "Incorrect keys in monitor metadata"

        last_logline = pandas.read_csv(f, index_col=None)
        assert set(last_logline.keys()) == {'l', 't', 'r'}, "Incorrect keys in monitor logline"
    os.remove(mon_file)
baselines/common/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # flake8: noqa F403
2
+ from baselines.common.console_util import *
3
+ from baselines.common.dataset import Dataset
4
+ from baselines.common.math_util import *
5
+ from baselines.common.misc_util import *
baselines/common/atari_wrappers.py ADDED
@@ -0,0 +1,290 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import os
3
+ os.environ.setdefault('PATH', '')
4
+ from collections import deque
5
+ import gym
6
+ from gym import spaces
7
+ import cv2
8
+ cv2.ocl.setUseOpenCL(False)
9
+ from .wrappers import TimeLimit
10
+
11
+
12
class NoopResetEnv(gym.Wrapper):
    def __init__(self, env, noop_max=30):
        """Sample initial states by taking random number of no-ops on reset.
        No-op is assumed to be action 0.
        """
        gym.Wrapper.__init__(self, env)
        self.noop_max = noop_max
        self.override_num_noops = None
        self.noop_action = 0
        assert env.unwrapped.get_action_meanings()[0] == 'NOOP'

    def reset(self, **kwargs):
        """ Do no-op action for a number of steps in [1, noop_max]."""
        self.env.reset(**kwargs)
        noops = self.override_num_noops
        if noops is None:
            noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) #pylint: disable=E1101
        assert noops > 0
        obs = None
        for _ in range(noops):
            obs, _, done, _ = self.env.step(self.noop_action)
            if done:
                obs = self.env.reset(**kwargs)
        return obs

    def step(self, ac):
        return self.env.step(ac)

class FireResetEnv(gym.Wrapper):
    def __init__(self, env):
        """Take action on reset for environments that are fixed until firing."""
        gym.Wrapper.__init__(self, env)
        meanings = env.unwrapped.get_action_meanings()
        assert meanings[1] == 'FIRE'
        assert len(meanings) >= 3

    def reset(self, **kwargs):
        self.env.reset(**kwargs)
        # press FIRE (action 1) then action 2, re-resetting if either ends the episode
        for fire_action in (1, 2):
            obs, _, done, _ = self.env.step(fire_action)
            if done:
                self.env.reset(**kwargs)
        return obs

    def step(self, ac):
        return self.env.step(ac)
60
+
61
class EpisodicLifeEnv(gym.Wrapper):
    def __init__(self, env):
        """Make end-of-life == end-of-episode, but only reset on true game over.
        Done by DeepMind for the DQN and co. since it helps value estimation.
        """
        gym.Wrapper.__init__(self, env)
        self.lives = 0
        self.was_real_done = True

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        self.was_real_done = done
        # make any loss of life terminal, then track the new life count
        remaining = self.env.unwrapped.ale.lives()
        if 0 < remaining < self.lives:
            # for Qbert sometimes we stay in lives == 0 condition for a few frames
            # so it's important to keep lives > 0, so that we only reset once
            # the environment advertises done.
            done = True
        self.lives = remaining
        return obs, reward, done, info

    def reset(self, **kwargs):
        """Reset only when lives are exhausted.
        This way all states are still reachable even though lives are episodic,
        and the learner need not know about any of this behind-the-scenes.
        """
        if self.was_real_done:
            obs = self.env.reset(**kwargs)
        else:
            # no-op step to advance from terminal/lost life state
            obs = self.env.step(0)[0]
        self.lives = self.env.unwrapped.ale.lives()
        return obs

class MaxAndSkipEnv(gym.Wrapper):
    def __init__(self, env, skip=4):
        """Return only every `skip`-th frame"""
        gym.Wrapper.__init__(self, env)
        # two most recent raw frames, for pixel-wise max pooling across time
        self._obs_buffer = np.zeros((2,) + env.observation_space.shape, dtype=np.uint8)
        self._skip = skip

    def step(self, action):
        """Repeat action, sum reward, and max over last observations."""
        total_reward = 0.0
        done = None
        for frame_idx in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            # stash the last two frames in slots 0 and 1 of the buffer
            if frame_idx >= self._skip - 2:
                self._obs_buffer[frame_idx - (self._skip - 2)] = obs
            total_reward += reward
            if done:
                break
        # Note that the observation on the done=True frame doesn't matter
        max_frame = self._obs_buffer.max(axis=0)
        return max_frame, total_reward, done, info

    def reset(self, **kwargs):
        return self.env.reset(**kwargs)
124
+
125
class ClipRewardEnv(gym.RewardWrapper):
    def __init__(self, env):
        gym.RewardWrapper.__init__(self, env)

    def reward(self, reward):
        """Bin reward to {+1, 0, -1} by its sign."""
        return np.sign(reward)


class WarpFrame(gym.ObservationWrapper):
    def __init__(self, env, width=84, height=84, grayscale=True, dict_space_key=None):
        """
        Warp frames to 84x84 as done in the Nature paper and later work.

        If the environment uses dictionary observations, `dict_space_key` can be specified which indicates which
        observation should be warped.
        """
        super().__init__(env)
        self._width = width
        self._height = height
        self._grayscale = grayscale
        self._key = dict_space_key
        num_colors = 1 if self._grayscale else 3

        new_space = gym.spaces.Box(
            low=0,
            high=255,
            shape=(self._height, self._width, num_colors),
            dtype=np.uint8,
        )
        if self._key is None:
            original_space = self.observation_space
            self.observation_space = new_space
        else:
            original_space = self.observation_space.spaces[self._key]
            self.observation_space.spaces[self._key] = new_space
        assert original_space.dtype == np.uint8 and len(original_space.shape) == 3

    def observation(self, obs):
        frame = obs if self._key is None else obs[self._key]

        if self._grayscale:
            frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        frame = cv2.resize(
            frame, (self._width, self._height), interpolation=cv2.INTER_AREA
        )
        if self._grayscale:
            frame = np.expand_dims(frame, -1)

        if self._key is None:
            return frame
        obs = obs.copy()
        obs[self._key] = frame
        return obs
186
+
187
+
188
class FrameStack(gym.Wrapper):
    def __init__(self, env, k):
        """Stack k last frames.

        Returns lazy array, which is much more memory efficient.

        See Also
        --------
        baselines.common.atari_wrappers.LazyFrames
        """
        gym.Wrapper.__init__(self, env)
        self.k = k
        self.frames = deque([], maxlen=k)
        shp = env.observation_space.shape
        stacked_shape = shp[:-1] + (shp[-1] * k,)
        self.observation_space = spaces.Box(low=0, high=255, shape=stacked_shape, dtype=env.observation_space.dtype)

    def reset(self):
        ob = self.env.reset()
        # seed the buffer with k copies of the first frame
        for _ in range(self.k):
            self.frames.append(ob)
        return self._get_ob()

    def step(self, action):
        ob, reward, done, info = self.env.step(action)
        self.frames.append(ob)
        return self._get_ob(), reward, done, info

    def _get_ob(self):
        assert len(self.frames) == self.k
        return LazyFrames(list(self.frames))

class ScaledFloatFrame(gym.ObservationWrapper):
    def __init__(self, env):
        gym.ObservationWrapper.__init__(self, env)
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=env.observation_space.shape, dtype=np.float32)

    def observation(self, observation):
        # careful! This undoes the memory optimization, use
        # with smaller replay buffers only.
        return np.array(observation).astype(np.float32) / 255.0
228
+
229
class LazyFrames(object):
    def __init__(self, frames):
        """This object ensures that common frames between the observations are only stored once.
        It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay
        buffers.

        This object should only be converted to numpy array before being passed to the model.

        You'd not believe how complex the previous solution was."""
        self._frames = frames
        self._out = None

    def _force(self):
        # concatenate lazily, exactly once, then drop the per-frame references
        if self._out is None:
            self._out = np.concatenate(self._frames, axis=-1)
            self._frames = None
        return self._out

    def __array__(self, dtype=None):
        stacked = self._force()
        return stacked if dtype is None else stacked.astype(dtype)

    def __len__(self):
        return len(self._force())

    def __getitem__(self, i):
        return self._force()[i]

    def count(self):
        stacked = self._force()
        return stacked.shape[stacked.ndim - 1]

    def frame(self, i):
        return self._force()[..., i]
265
+
266
def make_atari(env_id, max_episode_steps=None):
    """Build a raw NoFrameskip Atari env with noop resets and 4-frame max+skip."""
    env = gym.make(env_id)
    assert 'NoFrameskip' in env.spec.id
    env = NoopResetEnv(env, noop_max=30)
    env = MaxAndSkipEnv(env, skip=4)
    if max_episode_steps is not None:
        env = TimeLimit(env, max_episode_steps=max_episode_steps)
    return env

def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, scale=False):
    """Configure environment for DeepMind-style Atari.
    """
    if episode_life:
        env = EpisodicLifeEnv(env)
    # envs that need a FIRE press to start get the reset-time fire wrapper
    if 'FIRE' in env.unwrapped.get_action_meanings():
        env = FireResetEnv(env)
    env = WarpFrame(env)
    if scale:
        env = ScaledFloatFrame(env)
    if clip_rewards:
        env = ClipRewardEnv(env)
    if frame_stack:
        env = FrameStack(env, 4)
    return env
290
+
baselines/common/cg.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
def cg(f_Ax, b, cg_iters=10, callback=None, verbose=False, residual_tol=1e-10):
    """
    Conjugate-gradient solve of A x = b, with A supplied implicitly via the
    matrix-vector product f_Ax. (Demmel p 312.)
    """
    x = np.zeros_like(b)
    r = b.copy()  # residual b - A x, with x starting at 0
    p = b.copy()  # current search direction
    rdotr = r.dot(r)

    fmtstr = "%10i %10.3g %10.3g"
    titlestr = "%10s %10s %10s"
    if verbose:
        print(titlestr % ("iter", "residual norm", "soln norm"))

    for i in range(cg_iters):
        if callback is not None:
            callback(x)
        if verbose:
            print(fmtstr % (i, rdotr, np.linalg.norm(x)))
        z = f_Ax(p)
        v = rdotr / p.dot(z)
        x += v * p
        r -= v * z
        newrdotr = r.dot(r)
        p = r + (newrdotr / rdotr) * p
        rdotr = newrdotr
        if rdotr < residual_tol:
            break

    if callback is not None:
        callback(x)
    if verbose:
        print(fmtstr % (i + 1, rdotr, np.linalg.norm(x)))  # pylint: disable=W0631
    return x
baselines/common/cmd_util.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Helpers for scripts like run_atari.py.
3
+ """
4
+
5
+ import os
6
+ try:
7
+ from mpi4py import MPI
8
+ except ImportError:
9
+ MPI = None
10
+
11
+ import gym
12
+ from gym.wrappers import FlattenObservation, FilterObservation
13
+ from baselines import logger
14
+ from baselines.bench import Monitor
15
+ from baselines.common import set_global_seeds
16
+ from baselines.common.atari_wrappers import make_atari, wrap_deepmind
17
+ from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
18
+ from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
19
+ from baselines.common import retro_wrappers
20
+ from baselines.common.wrappers import ClipActionsWrapper
21
+
22
def make_vec_env(env_id, env_type, num_env, seed,
                 wrapper_kwargs=None,
                 env_kwargs=None,
                 start_index=0,
                 reward_scale=1.0,
                 flatten_dict_observations=True,
                 gamestate=None,
                 initializer=None,
                 force_dummy=False):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo.
    """
    wrapper_kwargs = wrapper_kwargs or {}
    env_kwargs = env_kwargs or {}
    mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    # offset the seed per MPI rank so workers do not share random streams
    seed = seed + 10000 * mpi_rank if seed is not None else None
    logger_dir = logger.get_dir()

    def make_thunk(rank, initializer=None):
        return lambda: make_env(
            env_id=env_id,
            env_type=env_type,
            mpi_rank=mpi_rank,
            subrank=rank,
            seed=seed,
            reward_scale=reward_scale,
            gamestate=gamestate,
            flatten_dict_observations=flatten_dict_observations,
            wrapper_kwargs=wrapper_kwargs,
            env_kwargs=env_kwargs,
            logger_dir=logger_dir,
            initializer=initializer
        )

    set_global_seeds(seed)
    thunk_ranks = range(start_index, start_index + num_env)
    if not force_dummy and num_env > 1:
        return SubprocVecEnv([make_thunk(rank, initializer=initializer) for rank in thunk_ranks])
    # dummy envs run in-process; the initializer is skipped for them
    return DummyVecEnv([make_thunk(rank, initializer=None) for rank in thunk_ranks])
60
+
61
+
62
def make_env(env_id, env_type, mpi_rank=0, subrank=0, seed=None, reward_scale=1.0, gamestate=None, flatten_dict_observations=True, wrapper_kwargs=None, env_kwargs=None, logger_dir=None, initializer=None):
    """Build, seed and monitor a single environment of the given type
    (atari / retro / generic gym)."""
    if initializer is not None:
        initializer(mpi_rank=mpi_rank, subrank=subrank)

    wrapper_kwargs = wrapper_kwargs or {}
    env_kwargs = env_kwargs or {}
    if ':' in env_id:
        # 'module:EnvId' form — import the module so its envs get registered
        import re
        import importlib
        module_name = re.sub(':.*', '', env_id)
        env_id = re.sub('.*:', '', env_id)
        importlib.import_module(module_name)

    if env_type == 'atari':
        env = make_atari(env_id)
    elif env_type == 'retro':
        import retro
        gamestate = gamestate or retro.State.DEFAULT
        env = retro_wrappers.make_retro(game=env_id, max_episode_steps=10000, use_restricted_actions=retro.Actions.DISCRETE, state=gamestate)
    else:
        env = gym.make(env_id, **env_kwargs)

    if flatten_dict_observations and isinstance(env.observation_space, gym.spaces.Dict):
        env = FlattenObservation(env)

    env.seed(seed + subrank if seed is not None else None)
    monitor_path = logger_dir and os.path.join(logger_dir, str(mpi_rank) + '.' + str(subrank))
    env = Monitor(env, monitor_path, allow_early_resets=True)

    if env_type == 'atari':
        env = wrap_deepmind(env, **wrapper_kwargs)
    elif env_type == 'retro':
        wrapper_kwargs.setdefault('frame_stack', 1)
        env = retro_wrappers.wrap_deepmind_retro(env, **wrapper_kwargs)

    if isinstance(env.action_space, gym.spaces.Box):
        env = ClipActionsWrapper(env)

    if reward_scale != 1:
        env = retro_wrappers.RewardScaler(env, reward_scale)

    return env
106
+
107
+
108
def make_mujoco_env(env_id, seed, reward_scale=1.0):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.

    Each MPI rank derives its own seed (seed + 1000 * rank) so parallel
    workers do not share random streams.
    """
    rank = MPI.COMM_WORLD.Get_rank()
    myseed = seed + 1000 * rank if seed is not None else None
    set_global_seeds(myseed)
    env = gym.make(env_id)
    logger_path = None if logger.get_dir() is None else os.path.join(logger.get_dir(), str(rank))
    env = Monitor(env, logger_path, allow_early_resets=True)
    # BUGFIX: seed the env with the rank-adjusted seed. It previously used the
    # raw `seed`, giving every MPI worker an identical environment seed even
    # though set_global_seeds was called with the per-rank value.
    env.seed(myseed)
    if reward_scale != 1.0:
        from baselines.common.retro_wrappers import RewardScaler
        env = RewardScaler(env, reward_scale)
    return env
123
+
124
def make_robotics_env(env_id, seed, rank=0):
    """
    Create a wrapped, monitored gym.Env for MuJoCo robotics tasks.
    """
    set_global_seeds(seed)
    env = gym.make(env_id)
    # expose only observation + desired_goal, flattened into a single vector
    env = FlattenObservation(FilterObservation(env, ['observation', 'desired_goal']))
    env = Monitor(
        env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
        info_keywords=('is_success',))
    env.seed(seed)
    return env

def arg_parser():
    """
    Create an empty argparse.ArgumentParser that shows defaults in --help.
    """
    import argparse
    return argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

def atari_arg_parser():
    """Deprecated alias for common_arg_parser()."""
    print('Obsolete - use common_arg_parser instead')
    return common_arg_parser()

def mujoco_arg_parser():
    """Deprecated alias for common_arg_parser()."""
    print('Obsolete - use common_arg_parser instead')
    return common_arg_parser()
154
+
155
def common_arg_parser():
    """
    Create an argparse.ArgumentParser for run_mujoco.py.
    """
    parser = arg_parser()
    parser.add_argument('--env', help='environment ID', type=str, default='Reacher-v2')
    parser.add_argument('--env_type', help='type of environment, used when the environment type cannot be automatically determined', type=str)
    parser.add_argument('--seed', help='RNG seed', type=int, default=None)
    parser.add_argument('--alg', help='Algorithm', type=str, default='ppo2')
    # float (not int) so scientific notation like 1e6 parses on the CLI.
    # BUGFIX: removed a stray trailing comma that turned this statement into a
    # useless 1-tuple expression.
    parser.add_argument('--num_timesteps', type=float, default=1e6)
    parser.add_argument('--network', help='network type (mlp, cnn, lstm, cnn_lstm, conv_only)', default=None)
    parser.add_argument('--gamestate', help='game state to load (so far only used in retro games)', default=None)
    parser.add_argument('--num_env', help='Number of environment copies being run in parallel. When not specified, set to number of cpus for Atari, and to 1 for Mujoco', default=None, type=int)
    parser.add_argument('--reward_scale', help='Reward scale factor. Default: 1.0', default=1.0, type=float)
    parser.add_argument('--save_path', help='Path to save trained model to', default=None, type=str)
    parser.add_argument('--save_video_interval', help='Save video every x steps (0 = disabled)', default=0, type=int)
    parser.add_argument('--save_video_length', help='Length of recorded video. Default: 200', default=200, type=int)
    parser.add_argument('--log_path', help='Directory to save learning curve data.', default=None, type=str)
    parser.add_argument('--play', default=False, action='store_true')
    return parser
175
+
176
def robotics_arg_parser():
    """
    Create an argparse.ArgumentParser for run_mujoco.py.
    """
    parser = arg_parser()
    parser.add_argument('--env', help='environment ID', type=str, default='FetchReach-v0')
    parser.add_argument('--seed', help='RNG seed', type=int, default=None)
    parser.add_argument('--num-timesteps', type=int, default=int(1e6))
    return parser


def parse_unknown_args(args):
    """
    Parse arguments not consumed by arg parser into a dictionary
    """
    retval = {}
    expecting_value = False
    key = None
    for token in args:
        if token.startswith('--'):
            if '=' in token:
                pieces = token.split('=')
                key = pieces[0][2:]
                retval[key] = pieces[1]
            else:
                key = token[2:]
                expecting_value = True
        elif expecting_value:
            retval[key] = token
            expecting_value = False
    return retval
baselines/common/console_util.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import print_function
2
+ from contextlib import contextmanager
3
+ import numpy as np
4
+ import time
5
+ import shlex
6
+ import subprocess
7
+
8
+ # ================================================================
9
+ # Misc
10
+ # ================================================================
11
+
12
def fmt_row(width, row, header=False):
    """Format the cells of `row` right-aligned to `width`, joined with ' | '.

    With header=True a dashed underline of matching length is appended.
    """
    line = " | ".join(fmt_item(cell, width) for cell in row)
    if header:
        line = line + "\n" + "-" * len(line)
    return line

def fmt_item(x, l):
    """Right-align x in a field of width l.

    Floats use '%7.5f', or '%7.2e' for very small/large magnitudes;
    0-d numpy arrays are unwrapped first.
    """
    if isinstance(x, np.ndarray):
        assert x.ndim == 0
        x = x.item()
    if isinstance(x, (float, np.float32, np.float64)):
        magnitude = abs(x)
        use_exp = (magnitude < 1e-4 or magnitude > 1e+4) and magnitude > 0
        rep = ("%7.2e" % x) if use_exp else ("%7.5f" % x)
    else:
        rep = str(x)
    return " " * (l - len(rep)) + rep
29
+
30
# ANSI foreground color codes (add 10 for the background/highlight variant).
color2num = dict(
    gray=30,
    red=31,
    green=32,
    yellow=33,
    blue=34,
    magenta=35,
    cyan=36,
    white=37,
    crimson=38
)

def colorize(string, color='green', bold=False, highlight=False):
    """Wrap `string` in ANSI escape codes for color, bold and highlight."""
    num = color2num[color]
    if highlight:
        num += 10
    codes = [str(num)]
    if bold:
        codes.append('1')
    return '\x1b[%sm%s\x1b[0m' % (';'.join(codes), string)
49
+
50
def print_cmd(cmd, dry=False):
    """Echo a command (colorized); dry=True marks it as not-to-be-executed."""
    if not isinstance(cmd, str):  # str commands are assumed shell-ready (shell=True)
        cmd = ' '.join(shlex.quote(arg) for arg in cmd)
    print(colorize(('DRY: ' if dry else 'CMD: ') + cmd))
56
+
57
+
58
def get_git_commit(cwd=None):
    """Short hash of HEAD in the git repo at `cwd` (or the current directory)."""
    out = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD'], cwd=cwd)
    return out.decode('utf8')

def get_git_commit_message(cwd=None):
    """Full commit message of HEAD in the git repo at `cwd`."""
    out = subprocess.check_output(['git', 'show', '-s', '--format=%B', 'HEAD'], cwd=cwd)
    return out.decode('utf8')
63
+
64
def ccap(cmd, dry=False, env=None, **kwargs):
    """Print `cmd`, then run it via subprocess.check_call unless dry is set."""
    print_cmd(cmd, dry)
    if dry:
        return
    subprocess.check_call(cmd, env=env, **kwargs)
68
+
69
+
70
# Nesting depth of active timed() blocks, used only for print indentation.
MESSAGE_DEPTH = 0

@contextmanager
def timed(msg):
    """Print `msg` on entry and the elapsed seconds on exit, indenting nested uses."""
    global MESSAGE_DEPTH #pylint: disable=W0603
    print(colorize('\t' * MESSAGE_DEPTH + '=: ' + msg, color='magenta'))
    start = time.time()
    MESSAGE_DEPTH += 1
    yield
    MESSAGE_DEPTH -= 1
    elapsed = time.time() - start
    print(colorize('\t' * MESSAGE_DEPTH + "done in %.3f seconds" % elapsed, color='magenta'))
baselines/common/dataset.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
class Dataset(object):
    """In-memory dataset over a dict of equally-sized arrays, supporting
    optional row shuffling and sequential minibatch iteration."""

    def __init__(self, data_map, deterministic=False, shuffle=True):
        self.data_map = data_map
        self.deterministic = deterministic  # if True, shuffle() is a no-op
        self.enable_shuffle = shuffle       # reshuffle at epoch boundaries
        self.n = next(iter(data_map.values())).shape[0]
        self._next_id = 0
        self.shuffle()

    def shuffle(self):
        """Apply one shared random permutation to every array and rewind."""
        if self.deterministic:
            return
        perm = np.arange(self.n)
        np.random.shuffle(perm)

        # Update in place so external holders of data_map see the new order.
        for key in list(self.data_map):
            self.data_map[key] = self.data_map[key][perm]

        self._next_id = 0

    def next_batch(self, batch_size):
        """Return up to batch_size consecutive rows; reshuffle at epoch end."""
        if self._next_id >= self.n and self.enable_shuffle:
            self.shuffle()

        start = self._next_id
        count = min(batch_size, self.n - start)
        self._next_id = start + count

        return {key: arr[start:start + count] for key, arr in self.data_map.items()}

    def iterate_once(self, batch_size):
        """Yield full batches for one pass over the data, then rewind."""
        if self.enable_shuffle:
            self.shuffle()

        while self._next_id <= self.n - batch_size:
            yield self.next_batch(batch_size)
        self._next_id = 0

    def subset(self, num_elements, deterministic=True):
        """New Dataset over the first num_elements rows of every array."""
        sliced = {key: arr[:num_elements] for key, arr in self.data_map.items()}
        return Dataset(sliced, deterministic)
48
+
49
+
50
def iterbatches(arrays, *, num_batches=None, batch_size=None, shuffle=True, include_final_partial_batch=True):
    """Yield minibatches (tuples of aligned row-slices) over the given arrays.

    Exactly one of num_batches / batch_size must be given. With shuffle=True
    the row order is randomized once before batching.
    """
    assert (num_batches is None) != (batch_size is None), 'Provide num_batches or batch_size, but not both'
    arrays = tuple(map(np.asarray, arrays))
    n = arrays[0].shape[0]
    assert all(a.shape[0] == n for a in arrays[1:])
    indices = np.arange(n)
    if shuffle:
        np.random.shuffle(indices)
    if num_batches is None:
        sections = np.arange(0, n, batch_size)[1:]
    else:
        sections = num_batches
    for batch_indices in np.array_split(indices, sections):
        if include_final_partial_batch or len(batch_indices) == batch_size:
            yield tuple(a[batch_indices] for a in arrays)
baselines/common/distributions.py ADDED
@@ -0,0 +1,355 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tensorflow as tf
2
+ import numpy as np
3
+ import baselines.common.tf_util as U
4
+ from baselines.a2c.utils import fc
5
+ from tensorflow.python.ops import math_ops
6
+
7
class Pd(object):
    """
    A particular probability distribution

    Abstract base: subclasses implement the distribution math (mode, neglogp,
    kl, entropy, sample) on top of a flat parameter tensor.
    """
    def flatparam(self):
        # Flat tensor of distribution parameters (e.g. logits, or mean||logstd).
        raise NotImplementedError
    def mode(self):
        raise NotImplementedError
    def neglogp(self, x):
        # Usually it's easier to define the negative logprob
        raise NotImplementedError
    def kl(self, other):
        raise NotImplementedError
    def entropy(self):
        raise NotImplementedError
    def sample(self):
        raise NotImplementedError
    def logp(self, x):
        # Log-probability, derived from neglogp.
        return - self.neglogp(x)
    def get_shape(self):
        return self.flatparam().shape
    @property
    def shape(self):
        return self.get_shape()
    def __getitem__(self, idx):
        # Index into a batch of distributions by slicing the flat parameters.
        return self.__class__(self.flatparam()[idx])
33
+
34
class PdType(object):
    """
    Parametrized family of probability distributions
    """
    def pdclass(self):
        # Concrete Pd subclass this family instantiates.
        raise NotImplementedError
    def pdfromflat(self, flat):
        # Build a Pd from a flat parameter tensor.
        return self.pdclass()(flat)
    def pdfromlatent(self, latent_vector, init_scale, init_bias):
        # Build (pd, flat_params) from a latent feature tensor.
        raise NotImplementedError
    def param_shape(self):
        raise NotImplementedError
    def sample_shape(self):
        raise NotImplementedError
    def sample_dtype(self):
        raise NotImplementedError

    def param_placeholder(self, prepend_shape, name=None):
        # Placeholder for distribution parameters, with extra leading dims.
        return tf.compat.v1.placeholder(dtype=tf.float32, shape=prepend_shape+self.param_shape(), name=name)
    def sample_placeholder(self, prepend_shape, name=None):
        # Placeholder for samples of this distribution family.
        return tf.compat.v1.placeholder(dtype=self.sample_dtype(), shape=prepend_shape+self.sample_shape(), name=name)

    def __eq__(self, other):
        # Families compare equal iff same type and same constructor attributes.
        return (type(self) == type(other)) and (self.__dict__ == other.__dict__)
58
+
59
class CategoricalPdType(PdType):
    """Family of categorical (softmax) distributions over `ncat` discrete actions."""
    def __init__(self, ncat):
        self.ncat = ncat  # number of categories
    def pdclass(self):
        return CategoricalPd
    def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
        # Project the latent to per-category logits (identity if widths match).
        pdparam = _matching_fc(latent_vector, 'pi', self.ncat, init_scale=init_scale, init_bias=init_bias)
        return self.pdfromflat(pdparam), pdparam

    def param_shape(self):
        return [self.ncat]
    def sample_shape(self):
        # Samples are scalar category indices.
        return []
    def sample_dtype(self):
        return tf.int32
74
+
75
+
76
class MultiCategoricalPdType(PdType):
    """Family of products of independent categoricals; nvec[i] is the size of component i."""
    def __init__(self, nvec):
        self.ncats = nvec.astype('int32')
        assert (self.ncats > 0).all()
    def pdclass(self):
        return MultiCategoricalPd
    def pdfromflat(self, flat):
        # flat holds the concatenated logits of all component categoricals.
        return MultiCategoricalPd(self.ncats, flat)

    def pdfromlatent(self, latent, init_scale=1.0, init_bias=0.0):
        pdparam = _matching_fc(latent, 'pi', self.ncats.sum(), init_scale=init_scale, init_bias=init_bias)
        return self.pdfromflat(pdparam), pdparam

    def param_shape(self):
        return [sum(self.ncats)]
    def sample_shape(self):
        # One integer index per component.
        return [len(self.ncats)]
    def sample_dtype(self):
        return tf.int32
95
+
96
class DiagGaussianPdType(PdType):
    """Family of diagonal-covariance Gaussians over `size`-dim continuous actions."""
    def __init__(self, size):
        self.size = size
    def pdclass(self):
        return DiagGaussianPd

    def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
        mean = _matching_fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias)
        # logstd is a state-independent trainable variable (shape [1, size]).
        logstd = tf.compat.v1.get_variable(name='pi/logstd', shape=[1, self.size], initializer=tf.compat.v1.zeros_initializer())
        # 'mean * 0.0 + logstd' broadcasts logstd to the batch shape of mean.
        pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        return self.pdfromflat(pdparam), mean

    def param_shape(self):
        # mean and logstd concatenated along the last axis.
        return [2*self.size]
    def sample_shape(self):
        return [self.size]
    def sample_dtype(self):
        return tf.float32
114
+
115
class BernoulliPdType(PdType):
    """Family of `size` independent Bernoulli variables (multi-binary actions)."""
    def __init__(self, size):
        self.size = size
    def pdclass(self):
        return BernoulliPd
    def param_shape(self):
        return [self.size]
    def sample_shape(self):
        return [self.size]
    def sample_dtype(self):
        # Samples are 0/1 per component.
        return tf.int32
    def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
        pdparam = _matching_fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias)
        return self.pdfromflat(pdparam), pdparam
129
+
130
+ # WRONG SECOND DERIVATIVES
131
+ # class CategoricalPd(Pd):
132
+ # def __init__(self, logits):
133
+ # self.logits = logits
134
+ # self.ps = tf.nn.softmax(logits)
135
+ # @classmethod
136
+ # def fromflat(cls, flat):
137
+ # return cls(flat)
138
+ # def flatparam(self):
139
+ # return self.logits
140
+ # def mode(self):
141
+ # return U.argmax(self.logits, axis=-1)
142
+ # def logp(self, x):
143
+ # return -tf.nn.sparse_softmax_cross_entropy_with_logits(self.logits, x)
144
+ # def kl(self, other):
145
+ # return tf.nn.softmax_cross_entropy_with_logits(other.logits, self.ps) \
146
+ # - tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps)
147
+ # def entropy(self):
148
+ # return tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps)
149
+ # def sample(self):
150
+ # u = tf.random_uniform(tf.shape(self.logits))
151
+ # return U.argmax(self.logits - tf.log(-tf.log(u)), axis=-1)
152
+
153
class CategoricalPd(Pd):
    """Categorical distribution parameterized by unnormalized logits."""
    def __init__(self, logits):
        self.logits = logits
    def flatparam(self):
        return self.logits
    def mode(self):
        return tf.argmax(input=self.logits, axis=-1)

    @property
    def mean(self):
        # Probability vector (softmax of the logits).
        return tf.nn.softmax(self.logits)
    def neglogp(self, x):
        # return tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x)
        # Note: we can't use sparse_softmax_cross_entropy_with_logits because
        # the implementation does not allow second-order derivatives...
        if x.dtype in {tf.uint8, tf.int32, tf.int64}:
            # one-hot encoding
            x_shape_list = x.shape.as_list()
            logits_shape_list = self.logits.get_shape().as_list()[:-1]
            # Only statically-known dimensions can be checked here.
            for xs, ls in zip(x_shape_list, logits_shape_list):
                if xs is not None and ls is not None:
                    assert xs == ls, 'shape mismatch: {} in x vs {} in logits'.format(xs, ls)

            x = tf.one_hot(x, self.logits.get_shape().as_list()[-1])
        else:
            # already encoded
            assert x.shape.as_list() == self.logits.shape.as_list()

        return tf.nn.softmax_cross_entropy_with_logits(
            logits=self.logits,
            labels=x)
    def kl(self, other):
        # Subtracting the per-row max keeps the exponentials numerically stable.
        a0 = self.logits - tf.reduce_max(input_tensor=self.logits, axis=-1, keepdims=True)
        a1 = other.logits - tf.reduce_max(input_tensor=other.logits, axis=-1, keepdims=True)
        ea0 = tf.exp(a0)
        ea1 = tf.exp(a1)
        z0 = tf.reduce_sum(input_tensor=ea0, axis=-1, keepdims=True)
        z1 = tf.reduce_sum(input_tensor=ea1, axis=-1, keepdims=True)
        p0 = ea0 / z0
        return tf.reduce_sum(input_tensor=p0 * (a0 - tf.math.log(z0) - a1 + tf.math.log(z1)), axis=-1)
    def entropy(self):
        a0 = self.logits - tf.reduce_max(input_tensor=self.logits, axis=-1, keepdims=True)
        ea0 = tf.exp(a0)
        z0 = tf.reduce_sum(input_tensor=ea0, axis=-1, keepdims=True)
        p0 = ea0 / z0
        return tf.reduce_sum(input_tensor=p0 * (tf.math.log(z0) - a0), axis=-1)
    def sample(self):
        # Gumbel-max trick: argmax of logits plus Gumbel noise samples the categorical.
        u = tf.random.uniform(tf.shape(input=self.logits), dtype=self.logits.dtype)
        return tf.argmax(input=self.logits - tf.math.log(-tf.math.log(u)), axis=-1)
    @classmethod
    def fromflat(cls, flat):
        return cls(flat)
205
+
206
class MultiCategoricalPd(Pd):
    """Product of independent categoricals built by splitting one flat logits tensor."""
    def __init__(self, nvec, flat):
        self.flat = flat
        # Split the concatenated logits into one CategoricalPd per component.
        self.categoricals = list(map(CategoricalPd,
            tf.split(flat, np.array(nvec, dtype=np.int32), axis=-1)))
    def flatparam(self):
        return self.flat
    def mode(self):
        return tf.cast(tf.stack([p.mode() for p in self.categoricals], axis=-1), tf.int32)
    def neglogp(self, x):
        # Components are independent, so neglogps add.
        return tf.add_n([p.neglogp(px) for p, px in zip(self.categoricals, tf.unstack(x, axis=-1))])
    def kl(self, other):
        return tf.add_n([p.kl(q) for p, q in zip(self.categoricals, other.categoricals)])
    def entropy(self):
        return tf.add_n([p.entropy() for p in self.categoricals])
    def sample(self):
        return tf.cast(tf.stack([p.sample() for p in self.categoricals], axis=-1), tf.int32)
    @classmethod
    def fromflat(cls, flat):
        # Cannot reconstruct without nvec; use MultiCategoricalPdType.pdfromflat.
        raise NotImplementedError
226
+
227
class DiagGaussianPd(Pd):
    """Diagonal-covariance Gaussian; flat params are mean||logstd on the last axis."""
    def __init__(self, flat):
        self.flat = flat
        mean, logstd = tf.split(axis=len(flat.shape)-1, num_or_size_splits=2, value=flat)
        self.mean = mean
        self.logstd = logstd
        self.std = tf.exp(logstd)
    def flatparam(self):
        return self.flat
    def mode(self):
        # Mode of a Gaussian is its mean.
        return self.mean
    def neglogp(self, x):
        # Standard diagonal-Gaussian negative log-likelihood.
        return 0.5 * tf.reduce_sum(input_tensor=tf.square((x - self.mean) / self.std), axis=-1) \
               + 0.5 * np.log(2.0 * np.pi) * tf.cast(tf.shape(input=x)[-1], dtype=tf.float32) \
               + tf.reduce_sum(input_tensor=self.logstd, axis=-1)
    def kl(self, other):
        assert isinstance(other, DiagGaussianPd)
        # Closed-form KL between diagonal Gaussians, summed over dimensions.
        return tf.reduce_sum(input_tensor=other.logstd - self.logstd + (tf.square(self.std) + tf.square(self.mean - other.mean)) / (2.0 * tf.square(other.std)) - 0.5, axis=-1)
    def entropy(self):
        return tf.reduce_sum(input_tensor=self.logstd + .5 * np.log(2.0 * np.pi * np.e), axis=-1)
    def sample(self):
        # Reparameterized sample: mean + std * standard normal noise.
        return self.mean + self.std * tf.random.normal(tf.shape(input=self.mean))
    @classmethod
    def fromflat(cls, flat):
        return cls(flat)
252
+
253
+
254
class BernoulliPd(Pd):
    """Independent Bernoulli variables parameterized by per-component logits."""
    def __init__(self, logits):
        self.logits = logits
        self.ps = tf.sigmoid(logits)  # per-component success probabilities
    def flatparam(self):
        return self.logits
    @property
    def mean(self):
        return self.ps
    def mode(self):
        # Rounding the probabilities picks the most likely value per component.
        return tf.round(self.ps)
    def neglogp(self, x):
        return tf.reduce_sum(input_tensor=tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.cast(x, dtype=tf.float32)), axis=-1)
    def kl(self, other):
        # KL(p, q) = cross_entropy(p, q) - entropy(p), summed over components.
        return tf.reduce_sum(input_tensor=tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps), axis=-1) - tf.reduce_sum(input_tensor=tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1)
    def entropy(self):
        return tf.reduce_sum(input_tensor=tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1)
    def sample(self):
        # Inverse-CDF sampling: component is 1 where uniform noise < p.
        u = tf.random.uniform(tf.shape(input=self.ps))
        return tf.cast(math_ops.less(u, self.ps), dtype=tf.float32)
    @classmethod
    def fromflat(cls, flat):
        return cls(flat)
277
+
278
def make_pdtype(ac_space):
    """Map a gym action space to the matching probability-distribution family."""
    from gym import spaces
    if isinstance(ac_space, spaces.Box):
        # Only flat (1-D) continuous action spaces are supported.
        assert len(ac_space.shape) == 1
        return DiagGaussianPdType(ac_space.shape[0])
    elif isinstance(ac_space, spaces.Discrete):
        return CategoricalPdType(ac_space.n)
    elif isinstance(ac_space, spaces.MultiDiscrete):
        return MultiCategoricalPdType(ac_space.nvec)
    elif isinstance(ac_space, spaces.MultiBinary):
        return BernoulliPdType(ac_space.n)
    else:
        raise NotImplementedError
291
+
292
def shape_el(v, i):
    # Dimension i of tensor v: the static value when known at graph-construction
    # time, otherwise the dynamic tf.shape element.
    maybe = v.get_shape()[i]
    if maybe is not None:
        return maybe
    else:
        return tf.shape(input=v)[i]
298
+
299
@U.in_session
def test_probtypes():
    """Run validate_probtype's statistical checks on every distribution family."""
    np.random.seed(0)

    # flat params = mean||logstd, so size // 2 action dimensions
    pdparam_diag_gauss = np.array([-.2, .3, .4, -.5, .1, -.5, .1, 0.8])
    diag_gauss = DiagGaussianPdType(pdparam_diag_gauss.size // 2) #pylint: disable=E1101
    validate_probtype(diag_gauss, pdparam_diag_gauss)

    pdparam_categorical = np.array([-.2, .3, .5])
    categorical = CategoricalPdType(pdparam_categorical.size) #pylint: disable=E1101
    validate_probtype(categorical, pdparam_categorical)

    # components of sizes 1, 2 and 3; flat params are the concatenated logits
    nvec = [1,2,3]
    pdparam_multicategorical = np.array([-.2, .3, .5, .1, 1, -.1])
    multicategorical = MultiCategoricalPdType(nvec) #pylint: disable=E1101
    validate_probtype(multicategorical, pdparam_multicategorical)

    pdparam_bernoulli = np.array([-.2, .3, .5])
    bernoulli = BernoulliPdType(pdparam_bernoulli.size) #pylint: disable=E1101
    validate_probtype(bernoulli, pdparam_bernoulli)
319
+
320
+
321
def validate_probtype(probtype, pdparam):
    """Monte-Carlo sanity checks of a distribution family at parameters pdparam.

    Verifies (within 3 standard errors over N samples) that:
    1. mean negative log likelihood of samples == entropy
    2. KL[p, q] == -entropy(p) - E_p[log q]
    """
    N = 100000
    # Check to see if mean negative log likelihood == differential entropy
    Mval = np.repeat(pdparam[None, :], N, axis=0)
    M = probtype.param_placeholder([N])
    X = probtype.sample_placeholder([N])
    pd = probtype.pdfromflat(M)
    calcloglik = U.function([X, M], pd.logp(X))
    calcent = U.function([M], pd.entropy())
    Xval = tf.compat.v1.get_default_session().run(pd.sample(), feed_dict={M:Mval})
    logliks = calcloglik(Xval, Mval)
    entval_ll = - logliks.mean() #pylint: disable=E1101
    entval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101
    entval = calcent(Mval).mean() #pylint: disable=E1101
    assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr # within 3 sigmas

    # Check to see if kldiv[p,q] = - ent[p] - E_p[log q]
    M2 = probtype.param_placeholder([N])
    pd2 = probtype.pdfromflat(M2)
    # q is a randomly perturbed copy of p's parameters
    q = pdparam + np.random.randn(pdparam.size) * 0.1
    Mval2 = np.repeat(q[None, :], N, axis=0)
    calckl = U.function([M, M2], pd.kl(pd2))
    klval = calckl(Mval, Mval2).mean() #pylint: disable=E1101
    logliks = calcloglik(Xval, Mval2)
    klval_ll = - entval - logliks.mean() #pylint: disable=E1101
    klval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101
    assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr # within 3 sigmas
    print('ok on', probtype, pdparam)
349
+
350
+
351
def _matching_fc(tensor, name, size, init_scale, init_bias):
    # Pass the latent through unchanged when it already has the requested
    # width; otherwise project it with a fully-connected layer named `name`.
    if tensor.shape[-1] == size:
        return tensor
    else:
        return fc(tensor, name, size, init_scale=init_scale, init_bias=init_bias)
baselines/common/input.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import tensorflow as tf
3
+ tf.compat.v1.disable_eager_execution()
4
+ from gym.spaces import Discrete, Box, MultiDiscrete
5
+
6
def observation_placeholder(ob_space, batch_size=None, name='Ob'):
    '''
    Create placeholder to feed observations into of the size appropriate to the observation space

    Parameters:
    ----------

    ob_space: gym.Space     observation space

    batch_size: int         size of the batch to be fed into input. Can be left None in most cases.

    name: str               name of the placeholder

    Returns:
    -------

    tensorflow placeholder tensor
    '''

    assert isinstance(ob_space, Discrete) or isinstance(ob_space, Box) or isinstance(ob_space, MultiDiscrete), \
        'Can only deal with Discrete and Box observation spaces for now'

    dtype = ob_space.dtype
    if dtype == np.int8:
        # int8 spaces are fed as uint8 — presumably image-like pixel data; TODO confirm
        dtype = np.uint8

    return tf.compat.v1.placeholder(shape=(batch_size,) + ob_space.shape, dtype=dtype, name=name)
33
+
34
+
35
def observation_input(ob_space, batch_size=None, name='Ob'):
    '''
    Create placeholder to feed observations into of the size appropriate to the observation space, and add input
    encoder of the appropriate type.

    Returns (placeholder, encoded_float_tensor).
    '''

    placeholder = observation_placeholder(ob_space, batch_size, name)
    return placeholder, encode_observation(ob_space, placeholder)
43
+
44
def encode_observation(ob_space, placeholder):
    '''
    Encode input in the way that is appropriate to the observation space

    Parameters:
    ----------

    ob_space: gym.Space             observation space

    placeholder: tf.placeholder     observation input placeholder
    '''
    if isinstance(ob_space, Discrete):
        # One-hot encode the scalar category index.
        return tf.cast(tf.one_hot(placeholder, ob_space.n), dtype=tf.float32)
    elif isinstance(ob_space, Box):
        # Continuous observations are just cast to float32.
        return tf.cast(placeholder, dtype=tf.float32)
    elif isinstance(ob_space, MultiDiscrete):
        # One-hot encode each component separately, then concatenate.
        placeholder = tf.cast(placeholder, tf.int32)
        one_hots = [tf.cast(tf.one_hot(placeholder[..., i], ob_space.nvec[i]), dtype=tf.float32) for i in range(placeholder.shape[-1])]
        return tf.concat(one_hots, axis=-1)
    else:
        raise NotImplementedError
65
+
baselines/common/math_util.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import scipy.signal
3
+
4
+
5
def discount(x, gamma):
    """
    Discounted cumulative sums along the 0th dimension of x.

    inputs
    ------
    x: ndarray
    gamma: float

    outputs
    -------
    y: ndarray with same shape as x, satisfying
        y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ... + gamma^k x[t+k],
        where k = len(x) - t - 1

    Implemented as an IIR filter over the time-reversed array.
    """
    assert x.ndim >= 1
    reversed_x = x[::-1]
    acc = scipy.signal.lfilter([1], [1, -gamma], reversed_x, axis=0)
    return acc[::-1]
24
+
25
def explained_variance(ypred, y):
    """
    Fraction of the variance of y that ypred explains:
    1 - Var[y - ypred] / Var[y].

    interpretation:
        ev=0  =>  might as well have predicted zero
        ev=1  =>  perfect prediction
        ev<0  =>  worse than just predicting zero

    Returns NaN when Var[y] == 0.
    """
    assert y.ndim == 1 and ypred.ndim == 1
    vary = np.var(y)
    if vary == 0:
        return np.nan
    return 1 - np.var(y - ypred) / vary
39
+
40
def explained_variance_2d(ypred, y):
    """
    Column-wise explained variance: for each column j returns
    1 - Var[y[:, j] - ypred[:, j]] / Var[y[:, j]].

    Columns where y has (near-)zero variance yield 0.
    """
    assert y.ndim == 2 and ypred.ndim == 2
    vary = np.var(y, axis=0)
    # axis=0 computes the residual variance per column; without it the scalar
    # overall variance was incorrectly broadcast against the per-column vary.
    out = 1 - np.var(y - ypred, axis=0) / vary
    out[vary < 1e-10] = 0
    return out
46
+
47
def ncc(ypred, y):
    """Pearson correlation coefficient between ypred and y."""
    corr = np.corrcoef(ypred, y)
    return corr[1, 0]
49
+
50
def flatten_arrays(arrs):
    """Concatenate the flattened contents of all arrays into one 1-D array."""
    return np.concatenate([np.ravel(arr) for arr in arrs])
52
+
53
def unflatten_vector(vec, shapes):
    """Inverse of flatten_arrays: slice vec into consecutive arrays of the given shapes."""
    arrays = []
    offset = 0
    for shape in shapes:
        count = int(np.prod(shape))
        arrays.append(vec[offset:offset + count].reshape(shape))
        offset += count
    return arrays
62
+
63
def discount_with_boundaries(X, New, gamma):
    """
    Discounted sums over time that reset at episode boundaries.

    X: 2d array of floats, time x features
    New: 2d array of bools, indicating when a new episode has started;
         discounting does not propagate across a boundary.
    """
    T = X.shape[0]
    Y = np.zeros_like(X)
    Y[-1] = X[-1]
    for t in reversed(range(T - 1)):
        # (1 - New[t+1]) zeroes the carried-over return at episode starts.
        Y[t] = X[t] + gamma * (1 - New[t + 1]) * Y[t + 1]
    return Y
74
+
75
def test_discount_with_boundaries():
    """Check discount_with_boundaries against a hand-computed 4-step example."""
    gamma = 0.9
    x = np.array([1.0, 2.0, 3.0, 4.0], 'float32')
    starts = [1.0, 0.0, 0.0, 1.0]
    y = discount_with_boundaries(x, starts, gamma)
    expected = [
        1 + gamma * 2 + gamma**2 * 3,
        2 + gamma * 3,
        3,
        4,
    ]
    assert np.allclose(y, expected)
baselines/common/misc_util.py ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gym
2
+ import numpy as np
3
+ import os
4
+ import pickle
5
+ import random
6
+ import tempfile
7
+ import zipfile
8
+
9
+
10
def zipsame(*seqs):
    """zip() that asserts all sequences have the same length."""
    first_len = len(seqs[0])
    assert all(len(seq) == first_len for seq in seqs[1:])
    return zip(*seqs)
14
+
15
+
16
class EzPickle(object):
    """Objects that are pickled and unpickled via their constructor arguments.

    Example usage:

        class Dog(Animal, EzPickle):
            def __init__(self, furcolor, tailkind="bushy"):
                Animal.__init__()
                EzPickle.__init__(furcolor, tailkind)
            ...

    On unpickling, a fresh instance is built by calling the class with the
    stored constructor arguments. This is mostly needed for environments
    wrapping C/C++ state (e.g. MuJoCo, Atari) that cannot be pickled directly.
    """

    def __init__(self, *args, **kwargs):
        self._ezpickle_args = args
        self._ezpickle_kwargs = kwargs

    def __getstate__(self):
        return {"_ezpickle_args": self._ezpickle_args,
                "_ezpickle_kwargs": self._ezpickle_kwargs}

    def __setstate__(self, d):
        # Rebuild via the constructor, then adopt the fresh instance's state.
        rebuilt = type(self)(*d["_ezpickle_args"], **d["_ezpickle_kwargs"])
        self.__dict__.update(rebuilt.__dict__)
46
+
47
+
48
def set_global_seeds(i):
    """Seed tensorflow (if importable), numpy and random from i.

    Under MPI each rank derives its own seed (i + 1000 * rank) so workers do
    not generate identical streams. Passing None leaves seeding nondeterministic.
    """
    try:
        import MPI
        rank = MPI.COMM_WORLD.Get_rank()
    except ImportError:
        rank = 0

    myseed = None if i is None else i + 1000 * rank
    try:
        import tensorflow as tf
        tf.compat.v1.set_random_seed(myseed)
    except ImportError:
        pass
    np.random.seed(myseed)
    random.seed(myseed)
63
+
64
+
65
def pretty_eta(seconds_left):
    """Convert a number of seconds into a short human-readable ETA string.

    Examples:
        2 days
        2 hours and 37 minutes
        less than a minute

    Parameters
    ----------
    seconds_left: int
        Number of seconds to be converted to the ETA
    Returns
    -------
    eta: str
        String representing the pretty ETA.
    """
    minutes_left, _ = divmod(seconds_left, 60)
    hours_left, minutes_left = divmod(minutes_left, 60)
    days_left, hours_left = divmod(hours_left, 24)

    def fmt(cnt, unit):
        # pluralize only for counts > 1
        return "{} {}{}".format(str(cnt), unit, 's' if cnt > 1 else '')

    if days_left > 0:
        msg = fmt(days_left, 'day')
        if hours_left > 0:
            msg += ' and ' + fmt(hours_left, 'hour')
        return msg
    if hours_left > 0:
        msg = fmt(hours_left, 'hour')
        if minutes_left > 0:
            msg += ' and ' + fmt(minutes_left, 'minute')
        return msg
    if minutes_left > 0:
        return fmt(minutes_left, 'minute')
    return 'less than a minute'
105
+
106
+
107
class RunningAvg(object):
    """Exponentially-weighted running estimate of a scalar quantity —
    like a mean, but more sensitive to recent changes."""

    def __init__(self, gamma, init_value=None):
        """
        Parameters
        ----------
        gamma: float
            Between 0 and 1; 0 makes the estimate track the most recent value.
        init_value: float or None
            Initial estimate; if None, it is set on the first update.
        """
        self._gamma = gamma
        self._value = init_value

    def update(self, new_val):
        """Fold a newly observed value into the estimate."""
        if self._value is None:
            self._value = new_val
            return
        self._value = self._gamma * self._value + (1.0 - self._gamma) * new_val

    def __float__(self):
        """Current estimate."""
        return self._value
139
+
140
def boolean_flag(parser, name, default=False, help=None):
    """Register paired --<name> / --no-<name> boolean flags on an argparse parser.

    Parameters
    ----------
    parser: argparse.Parser
        parser to add the flags to
    name: str
        --<name> enables the flag, --no-<name> disables it; the destination
        attribute is `name` with dashes replaced by underscores
    default: bool or None
        default value of the flag
    help: str
        help string for the enabling flag
    """
    dest = name.replace('-', '_')
    parser.add_argument("--" + name, action="store_true", default=default, dest=dest, help=help)
    parser.add_argument("--no-" + name, action="store_false", dest=dest)
157
+
158
+
159
def get_wrapper_by_name(env, classname):
    """Walk the wrapper chain of a possibly multiply-wrapped gym env and
    return the wrapper whose class_name() equals classname.

    Parameters
    ----------
    env: gym.Env or gym.Wrapper
        gym environment
    classname: str
        name of the wrapper

    Returns
    -------
    wrapper: gym.Wrapper
        wrapper named classname

    Raises ValueError if no such wrapper was applied.
    """
    current = env
    while True:
        if current.class_name() == classname:
            return current
        if not isinstance(current, gym.Wrapper):
            raise ValueError("Couldn't find wrapper named %s" % classname)
        current = current.env
183
+
184
+
185
def relatively_safe_pickle_dump(obj, path, compression=False):
    """This is just like regular pickle dump, except that the failure cases
    differ:

    - It's never possible that we end up with a pickle in corrupted state.
    - If there was a different file at the path, that file will remain
      unchanged in the event of failure (provided that filesystem rename is
      atomic).
    - It is sometimes possible that we end up with a useless temp file which
      needs to be deleted manually (it will be removed automatically on the
      next function call)

    The intended use case is periodic checkpoints of experiment state, such
    that we never corrupt previous checkpoints if the current one fails.

    Parameters
    ----------
    obj: object
        object to pickle
    path: str
        path to the output file
    compression: bool
        if true the pickle is stored zip-compressed (zipfile, entry "data")
    """
    temp_storage = path + ".relatively_safe"
    if compression:
        # Using gzip here would be simpler, but the size is limited to 2GB
        with tempfile.NamedTemporaryFile() as uncompressed_file:
            pickle.dump(obj, uncompressed_file)
            uncompressed_file.file.flush()
            with zipfile.ZipFile(temp_storage, "w", compression=zipfile.ZIP_DEFLATED) as myzip:
                myzip.write(uncompressed_file.name, "data")
    else:
        with open(temp_storage, "wb") as f:
            pickle.dump(obj, f)
    # os.replace (unlike os.rename) atomically overwrites an existing
    # destination on every platform; os.rename raises on Windows when the
    # destination already exists, breaking repeated checkpointing.
    os.replace(temp_storage, path)
219
+
220
+
221
def pickle_load(path, compression=False):
    """Unpickle a possibly compressed pickle.

    Parameters
    ----------
    path: str
        path to the file to read
    compression: bool
        if true, assumes the pickle was written zip-compressed (entry "data",
        as produced by relatively_safe_pickle_dump) and decompresses it

    Returns
    -------
    obj: object
        the unpickled object
    """
    if not compression:
        with open(path, "rb") as f:
            return pickle.load(f)
    with zipfile.ZipFile(path, "r", compression=zipfile.ZIP_DEFLATED) as myzip:
        with myzip.open("data") as f:
            return pickle.load(f)
baselines/common/models.py ADDED
@@ -0,0 +1,406 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+
3
+ import numpy as np
4
+ import tensorflow as tf
5
+ from baselines.a2c import utils
6
+ from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch
7
+ from baselines.common.mpi_running_mean_std import RunningMeanStd
8
+ from keras import layers
9
+ from itertools import combinations
10
+
11
+ mapping = {}
12
+
13
+
14
def register(name):
    """Decorator factory: record the decorated function in the module-level
    ``mapping`` registry under ``name`` and return it unchanged."""
    def _decorator(func):
        mapping[name] = func
        return func
    return _decorator
20
+
21
+
22
def nature_cnn(unscaled_images, **conv_kwargs):
    """
    CNN from Nature paper (Mnih et al. DQN architecture): three conv layers
    followed by a 512-unit fully-connected layer, all ReLU-activated.
    """
    relu = tf.nn.relu
    x = tf.cast(unscaled_images, tf.float32) / 255.
    x = relu(conv(x, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2), **conv_kwargs))
    x = relu(conv(x, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2), **conv_kwargs))
    x = relu(conv(x, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), **conv_kwargs))
    x = conv_to_fc(x)
    return relu(fc(x, 'fc1', nh=512, init_scale=np.sqrt(2)))
34
+
35
+
36
def build_impala_cnn(unscaled_images, depths=[16, 32, 32], **conv_kwargs):
    """
    Model used in the paper "IMPALA: Scalable Distributed Deep-RL with
    Importance Weighted Actor-Learner Architectures" https://arxiv.org/abs/1802.01561

    Parameters
    ----------
    unscaled_images: uint8 image tensor; scaled to [0, 1] internally
    depths: output channel count for each of the conv sequences

    Returns
    -------
    a (batch, 256) float32 feature tensor
    """

    layer_num = 0

    def get_layer_num_str():
        # Unique suffix so every conv/dense layer gets a distinct variable name.
        nonlocal layer_num
        num_str = str(layer_num)
        layer_num += 1
        return num_str

    def conv_layer(out, depth):
        return tf.compat.v1.layers.conv2d(out, depth, 3, padding='same', name='layer_' + get_layer_num_str())

    def residual_block(inputs):
        # TF1 shape entries are Dimension objects (need .value); TF2 returns
        # plain ints.  Fix: catch only AttributeError instead of a bare except,
        # which previously swallowed any error raised while computing the shape.
        try:
            depth = inputs.get_shape()[-1].value
        except AttributeError:
            depth = inputs.get_shape()[-1]

        out = tf.nn.relu(inputs)

        out = conv_layer(out, depth)
        out = tf.nn.relu(out)
        out = conv_layer(out, depth)
        return out + inputs  # residual connection

    def conv_sequence(inputs, depth):
        out = conv_layer(inputs, depth)
        out = tf.compat.v1.layers.max_pooling2d(out, pool_size=3, strides=2, padding='same')
        out = residual_block(out)
        out = residual_block(out)
        return out

    out = tf.cast(unscaled_images, tf.float32) / 255.

    for depth in depths:
        out = conv_sequence(out, depth)

    out = tf.compat.v1.layers.flatten(out)
    out = tf.nn.relu(out)
    out = tf.compat.v1.layers.dense(out, 256, activation=tf.nn.relu, name='layer_' + get_layer_num_str())

    return out
83
+
84
+
85
def build_skill_impala_cnn(unscaled_images, depths=[16, 32, 32], emb_dim=256, num_embeddings=8, seed=0,
                           **conv_kwargs):
    """
    Modified impala cnn model by adding the skill module.

    Parameters
    ----------
    unscaled_images: uint8 image tensor; scaled to [0, 1] internally
    depths: output channel count for each conv sequence
    emb_dim: width of the final dense feature layer
    num_embeddings: codebook size of the vector quantizer
    seed: seed for the codebook initializer

    Returns
    -------
    (out, skill_out, pure_out, vq_out, pure_vq_out, embeddings, encoding_indices)
    where `out` is `pure_out` shifted by the normalized code indices.
    """

    layer_num = 0  # counter used to give every layer a unique name

    def get_layer_num_str():
        nonlocal layer_num
        num_str = str(layer_num)
        layer_num += 1
        return num_str

    def conv_layer(out, depth):
        return tf.compat.v1.layers.conv2d(out, depth, 3, padding='same', name='layer_' + get_layer_num_str())

    def residual_block(inputs):
        # depth = inputs.get_shape()[-1].value
        depth = inputs.get_shape()[-1]  # TF2-style: shape entries are plain ints

        out = tf.nn.relu(inputs)

        out = conv_layer(out, depth)
        out = tf.nn.relu(out)
        out = conv_layer(out, depth)
        return out + inputs  # residual connection

    def conv_sequence(inputs, depth):
        out = conv_layer(inputs, depth)
        out = tf.compat.v1.layers.max_pooling2d(out, pool_size=3, strides=2, padding='same')
        out = residual_block(out)
        out = residual_block(out)
        return out

    out = tf.cast(unscaled_images, tf.float32) / 255.

    for depth in depths:
        out = conv_sequence(out, depth)

    out = tf.compat.v1.layers.flatten(out)
    out = tf.nn.relu(out)
    pure_out = tf.compat.v1.layers.dense(out, emb_dim, activation=tf.nn.relu, name='layer_' + get_layer_num_str())

    # skill module: project features down to 2-D and quantize against a
    # learned codebook of `num_embeddings` entries.
    skill_out = tf.compat.v1.layers.dense(pure_out, emb_dim // 2, activation=None, name='layer_' + get_layer_num_str())
    skill_out = tf.compat.v1.layers.dense(skill_out, 2, activation=None, name='layer_' + get_layer_num_str())
    vq_layer = VectorQuantizer(num_embeddings, 2, seed=seed, name="vector_quantizer")
    vq_out, pure_vq_out, encoding_indices = vq_layer(skill_out)

    # Code indices normalized by the codebook size and tiled to emb_dim columns
    # (presumably to make the skill identity visible in every feature dim — confirm).
    encoding_indices_ = tf.cast(
        tf.tile(encoding_indices / vq_layer.num_embeddings, tf.constant([1, emb_dim], tf.int32)), tf.float32)

    # add the normalized skill indices to features
    out = tf.math.add(pure_out, encoding_indices_)

    return out, skill_out, pure_out, vq_out, pure_vq_out, vq_layer.embeddings, encoding_indices
142
+
143
+
144
@register("mlp")
def mlp(num_layers=2, num_hidden=64, activation=tf.tanh, layer_norm=False):
    """
    Stack of fully-connected layers for use in a policy / q-function approximator.

    Parameters
    ----------
    num_layers: int     number of fully-connected layers (default: 2)
    num_hidden: int     size of fully-connected layers (default: 64)
    activation:         activation function (default: tf.tanh)
    layer_norm: bool    apply layer normalization after each fc layer

    Returns
    -------
    function that builds the fully connected network given an input tensor / placeholder
    """
    def network_fn(X):
        out = tf.compat.v1.layers.flatten(X)
        for idx in range(num_layers):
            out = fc(out, 'mlp_fc{}'.format(idx), nh=num_hidden, init_scale=np.sqrt(2))
            if layer_norm:
                out = tf.contrib.layers.layer_norm(out, center=True, scale=True)
            out = activation(out)
        return out

    return network_fn
175
+
176
+
177
@register("cnn")
def cnn(**conv_kwargs):
    """Builder for the Nature DQN CNN (see ``nature_cnn``)."""
    def network_fn(images):
        return nature_cnn(images, **conv_kwargs)
    return network_fn
183
+
184
+
185
@register("impala_cnn")
def impala_cnn(**conv_kwargs):
    """Builder for the IMPALA CNN.

    Fix: forward **conv_kwargs to build_impala_cnn — previously they were
    accepted but silently dropped, unlike the other registered builders.
    Calls with no kwargs behave exactly as before.
    """
    def network_fn(X):
        return build_impala_cnn(X, **conv_kwargs)

    return network_fn
191
+
192
+
193
@register("cnn_small")
def cnn_small(**conv_kwargs):
    """Small two-conv-layer CNN feature extractor (128-unit output)."""
    def network_fn(X):
        relu = tf.nn.relu
        out = tf.cast(X, tf.float32) / 255.
        out = relu(conv(out, 'c1', nf=8, rf=8, stride=4, init_scale=np.sqrt(2), **conv_kwargs))
        out = relu(conv(out, 'c2', nf=16, rf=4, stride=2, init_scale=np.sqrt(2), **conv_kwargs))
        out = conv_to_fc(out)
        return relu(fc(out, 'fc1', nh=128, init_scale=np.sqrt(2)))

    return network_fn
206
+
207
+
208
@register("lstm")
def lstm(nlstm=128, layer_norm=False):
    """
    Builds LSTM (Long-Short Term Memory) network to be used in a policy.
    Note that the resulting function returns not only the output of the LSTM
    (i.e. hidden state of lstm for each step in the sequence), but also a dictionary
    with auxiliary tensors to be set as policy attributes.

    Specifically,
        S is a placeholder to feed current state (LSTM state has to be managed outside policy)
        M is a placeholder for the mask (used to mask out observations after the end of the episode, but can be used for other purposes too)
        initial_state is a numpy array containing initial lstm state (usually zeros)
        state is the output LSTM state (to be fed into S at the next call)


    An example of usage of lstm-based policy can be found here: common/tests/test_doc_examples.py/test_lstm_example

    Parameters:
    ----------

    nlstm: int          LSTM hidden state size

    layer_norm: bool    if True, layer-normalized version of LSTM is used

    Returns:
    -------

    function that builds LSTM with a given input tensor / placeholder
    """

    def network_fn(X, nenv=1):
        # X holds nenv * nsteps observations stacked along the batch axis.
        nbatch = X.shape[0]
        nsteps = nbatch // nenv  # assumes nbatch is a multiple of nenv

        h = tf.compat.v1.layers.flatten(X)

        # Placeholders fed by the caller each step.
        M = tf.compat.v1.placeholder(tf.float32, [nbatch])  # mask (done t-1)
        S = tf.compat.v1.placeholder(tf.float32, [nenv, 2 * nlstm])  # states (2*nlstm wide — presumably cell+hidden concatenated; confirm in utils.lstm)

        # Rearrange flat batch into per-env sequences of length nsteps.
        xs = batch_to_seq(h, nenv, nsteps)
        ms = batch_to_seq(M, nenv, nsteps)

        if layer_norm:
            h5, snew = utils.lnlstm(xs, ms, S, scope='lnlstm', nh=nlstm)
        else:
            h5, snew = utils.lstm(xs, ms, S, scope='lstm', nh=nlstm)

        # Back to the flat (nbatch, nlstm) layout expected by the policy head.
        h = seq_to_batch(h5)
        initial_state = np.zeros(S.shape.as_list(), dtype=float)

        return h, {'S': S, 'M': M, 'state': snew, 'initial_state': initial_state}

    return network_fn
261
+
262
+
263
@register("cnn_lstm")
def cnn_lstm(nlstm=128, layer_norm=False, conv_fn=nature_cnn, **conv_kwargs):
    """
    CNN feature extractor followed by an LSTM.

    Same contract as ``lstm`` — network_fn returns (features,
    {'S', 'M', 'state', 'initial_state'}) — except observations are first
    passed through ``conv_fn`` (default: nature_cnn).
    """
    def network_fn(X, nenv=1):
        nbatch = X.shape[0]
        nsteps = nbatch // nenv  # assumes nbatch is a multiple of nenv

        # Convolutional features, one row per observation.
        h = conv_fn(X, **conv_kwargs)

        M = tf.compat.v1.placeholder(tf.float32, [nbatch])  # mask (done t-1)
        S = tf.compat.v1.placeholder(tf.float32, [nenv, 2 * nlstm])  # states

        # Rearrange flat batch into per-env sequences of length nsteps.
        xs = batch_to_seq(h, nenv, nsteps)
        ms = batch_to_seq(M, nenv, nsteps)

        if layer_norm:
            h5, snew = utils.lnlstm(xs, ms, S, scope='lnlstm', nh=nlstm)
        else:
            h5, snew = utils.lstm(xs, ms, S, scope='lstm', nh=nlstm)

        h = seq_to_batch(h5)
        initial_state = np.zeros(S.shape.as_list(), dtype=float)

        return h, {'S': S, 'M': M, 'state': snew, 'initial_state': initial_state}

    return network_fn
288
+
289
+
290
@register("impala_cnn_lstm")
def impala_cnn_lstm():
    """IMPALA CNN features feeding a 256-unit LSTM."""
    return cnn_lstm(conv_fn=build_impala_cnn, nlstm=256)
293
+
294
+
295
@register("cnn_lnlstm")
def cnn_lnlstm(nlstm=128, **conv_kwargs):
    """CNN + layer-normalized LSTM; same contract as ``cnn_lstm``."""
    return cnn_lstm(nlstm=nlstm, layer_norm=True, **conv_kwargs)
298
+
299
+
300
@register("conv_only")
def conv_only(convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], **conv_kwargs):
    '''
    convolutions-only net

    Parameters:
    ----------

    convs: list of triples (filter_number, filter_size, stride) — one entry per conv layer.

    Returns:

    function that takes a tensorflow tensor as input and returns the output of the last convolutional layer

    '''
    def network_fn(X):
        out = tf.cast(X, tf.float32) / 255.
        with tf.compat.v1.variable_scope("convnet"):
            for n_filters, ksize, stride in convs:
                out = tf.contrib.layers.convolution2d(out,
                                                      num_outputs=n_filters,
                                                      kernel_size=ksize,
                                                      stride=stride,
                                                      activation_fn=tf.nn.relu,
                                                      **conv_kwargs)
        return out

    return network_fn
330
+
331
+
332
def _normalize_clip_observation(x, clip_range=[-5.0, 5.0]):
    """Normalize x by a running mean/std and clip to clip_range.

    Returns (normalized_tensor, RunningMeanStd instance)."""
    rms = RunningMeanStd(shape=x.shape[1:])
    lo, hi = min(clip_range), max(clip_range)
    norm_x = tf.clip_by_value((x - rms.mean) / rms.std, lo, hi)
    return norm_x, rms
336
+
337
+
338
def get_network_builder(name):
    """
    Look up a registered network builder by name (or pass a callable through).

    If you want to register your own network outside models.py:

    Usage Example:
    -------------
    from baselines.common.models import register
    @register("your_network_name")
    def your_network_define(**net_kwargs):
        ...
        return network_fn

    """
    # A callable is already a builder — return it unchanged.
    if callable(name):
        return name
    builder = mapping.get(name)
    if builder is None:
        raise ValueError('Unknown network type: {}'.format(name))
    return builder
357
+
358
+
359
class VectorQuantizer(layers.Layer):
    """Vector-quantization layer (VQ-VAE style) with a learnable codebook.

    Maps each ``embedding_dim``-sized input vector to its nearest codebook
    entry and passes gradients through with the straight-through estimator.
    """

    def __init__(self, num_embeddings, embedding_dim, seed=0, **kwargs):
        """
        num_embeddings: number of codebook vectors
        embedding_dim: dimensionality of each codebook vector
        seed: seed for the uniform codebook initializer
        """
        super().__init__(**kwargs)
        self.embedding_dim = embedding_dim
        self.num_embeddings = num_embeddings

        # Initialize the embeddings which we will quantize.
        w_init = tf.compat.v1.random_uniform_initializer(minval=-1 / num_embeddings, maxval=1 / num_embeddings,
                                                         seed=seed)

        # Codebook matrix of shape (embedding_dim, num_embeddings).
        self.embeddings = tf.compat.v1.get_variable(
            initializer=w_init(
                shape=(self.embedding_dim, self.num_embeddings), dtype="float32"
            ),
            trainable=True,
            name="embeddings_vqvae",
        )

    def call(self, x):
        """Quantize x; returns (straight-through output, raw quantized, code indices)."""
        # Calculate the input shape of the inputs and
        # then flatten the inputs keeping `embedding_dim` intact.
        input_shape = tf.shape(input=x)
        flattened = tf.reshape(x, [-1, self.embedding_dim])

        # Quantization: nearest-code lookup, then one-hot matmul to gather codes.
        encoding_indices = self.get_code_indices(flattened)
        encoding_indices = tf.reshape(encoding_indices, [input_shape[0], -1])
        encodings = tf.one_hot(encoding_indices, self.num_embeddings)
        quantized = tf.matmul(encodings, self.embeddings, transpose_b=True)
        quantized = tf.reshape(quantized, input_shape)

        # Straight-through estimator: forward pass yields `quantized`,
        # backward pass routes gradients directly to `x`.
        quantized_ = x + tf.stop_gradient(quantized - x)

        return quantized_, quantized, encoding_indices

    def get_code_indices(self, flattened_inputs):
        """Return, for each row of flattened_inputs, the index of the nearest code."""
        # Squared Euclidean distance expanded as |a|^2 + |b|^2 - 2*a.b.
        similarity = tf.matmul(flattened_inputs, self.embeddings)
        distances = (
            tf.reduce_sum(input_tensor=flattened_inputs ** 2, axis=1, keepdims=True)
            + tf.reduce_sum(input_tensor=self.embeddings ** 2, axis=0)
            - 2 * similarity
        )

        # Derive the indices for minimum distances.
        encoding_indices = tf.argmin(input=distances, axis=1)
        return encoding_indices
baselines/common/mpi_adam.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import baselines.common.tf_util as U
2
+ import tensorflow as tf
3
+ import numpy as np
4
+ try:
5
+ from mpi4py import MPI
6
+ except ImportError:
7
+ MPI = None
8
+
9
+
10
class MpiAdam(object):
    """Adam optimizer over a flat parameter vector, with gradients summed
    across MPI workers (falls back to single-process when mpi4py is absent)."""

    def __init__(self, var_list, *, beta1=0.9, beta2=0.999, epsilon=1e-08, scale_grad_by_procs=True, comm=None):
        # var_list: TF variables to optimize; they are read/written as one flat vector.
        self.var_list = var_list
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.scale_grad_by_procs = scale_grad_by_procs
        size = sum(U.numel(v) for v in var_list)
        self.m = np.zeros(size, 'float32')  # first-moment (mean) estimate
        self.v = np.zeros(size, 'float32')  # second-moment (uncentered variance) estimate
        self.t = 0  # timestep, used for Adam bias correction
        self.setfromflat = U.SetFromFlat(var_list)
        self.getflat = U.GetFlat(var_list)
        self.comm = MPI.COMM_WORLD if comm is None and MPI is not None else comm

    def update(self, localg, stepsize):
        """Apply one Adam step using this worker's flat gradient `localg`."""
        if self.t % 100 == 0:
            # Periodically verify all workers still hold identical parameters.
            self.check_synced()
        localg = localg.astype('float32')
        if self.comm is not None:
            # Sum gradients over workers; optionally scale to get the mean.
            globalg = np.zeros_like(localg)
            self.comm.Allreduce(localg, globalg, op=MPI.SUM)
            if self.scale_grad_by_procs:
                globalg /= self.comm.Get_size()
        else:
            globalg = np.copy(localg)

        self.t += 1
        # Bias-corrected effective step size (standard Adam).
        a = stepsize * np.sqrt(1 - self.beta2**self.t)/(1 - self.beta1**self.t)
        self.m = self.beta1 * self.m + (1 - self.beta1) * globalg
        self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg)
        step = (- a) * self.m / (np.sqrt(self.v) + self.epsilon)
        self.setfromflat(self.getflat() + step)

    def sync(self):
        """Broadcast rank-0 parameters to all workers."""
        if self.comm is None:
            return
        theta = self.getflat()
        self.comm.Bcast(theta, root=0)
        self.setfromflat(theta)

    def check_synced(self):
        """Assert that every worker's parameters match rank 0's."""
        if self.comm is None:
            return
        if self.comm.Get_rank() == 0: # this is root
            theta = self.getflat()
            self.comm.Bcast(theta, root=0)
        else:
            thetalocal = self.getflat()
            thetaroot = np.empty_like(thetalocal)
            self.comm.Bcast(thetaroot, root=0)
            assert (thetaroot == thetalocal).all(), (thetaroot, thetalocal)
62
+
63
@U.in_session
def test_MpiAdam():
    """Check that MpiAdam matches tf's AdamOptimizer on a toy loss."""
    np.random.seed(0)
    tf.compat.v1.set_random_seed(0)

    a = tf.Variable(np.random.randn(3).astype('float32'))
    b = tf.Variable(np.random.randn(2,5).astype('float32'))
    loss = tf.reduce_sum(input_tensor=tf.square(a)) + tf.reduce_sum(input_tensor=tf.sin(b))

    stepsize = 1e-2
    # Reference trajectory: plain TF Adam.
    update_op = tf.compat.v1.train.AdamOptimizer(stepsize).minimize(loss)
    do_update = U.function([], loss, updates=[update_op])

    tf.compat.v1.get_default_session().run(tf.compat.v1.global_variables_initializer())
    losslist_ref = []
    for i in range(10):
        l = do_update()
        print(i, l)
        losslist_ref.append(l)

    # Re-initialize identically, then optimize with MpiAdam instead.
    tf.compat.v1.set_random_seed(0)
    tf.compat.v1.get_default_session().run(tf.compat.v1.global_variables_initializer())

    var_list = [a,b]
    lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)])
    adam = MpiAdam(var_list)

    losslist_test = []
    for i in range(10):
        l,g = lossandgrad()
        adam.update(g, stepsize)
        print(i,l)
        losslist_test.append(l)

    # Both optimizers should produce (nearly) identical loss trajectories.
    np.testing.assert_allclose(np.array(losslist_ref), np.array(losslist_test), atol=1e-4)


if __name__ == '__main__':
    test_MpiAdam()
baselines/common/mpi_adam_optimizer.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import tensorflow as tf
3
+ from baselines.common import tf_util as U
4
+ from baselines.common.tests.test_with_mpi import with_mpi
5
+ from baselines import logger
6
+ try:
7
+ from mpi4py import MPI
8
+ except ImportError:
9
+ MPI = None
10
+
11
class MpiAdamOptimizer(tf.compat.v1.train.AdamOptimizer):
    """Adam optimizer that averages gradients across mpi processes."""

    def __init__(self, comm, grad_clip=None, mpi_rank_weight=1, **kwargs):
        # comm: MPI communicator used for the gradient Allreduce
        # grad_clip: if not None, gradients with norm > 1 are rescaled to unit norm
        # mpi_rank_weight: relative weight of this worker in the weighted average
        self.comm = comm
        self.grad_clip = grad_clip
        self.mpi_rank_weight = mpi_rank_weight
        tf.compat.v1.train.AdamOptimizer.__init__(self, **kwargs)

    def compute_gradients(self, loss, var_list, **kwargs):
        """Compute local gradients, then average them across workers via Allreduce."""
        grads_and_vars = tf.compat.v1.train.AdamOptimizer.compute_gradients(self, loss, var_list, **kwargs)
        grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None]
        # Flatten all gradients into one vector, pre-scaled by this rank's weight.
        flat_grad = tf.concat([tf.reshape(g, (-1,)) for g, v in grads_and_vars], axis=0) * self.mpi_rank_weight
        shapes = [v.shape.as_list() for g, v in grads_and_vars]
        sizes = [int(np.prod(s)) for s in shapes]

        # Total weight over all ranks, used to turn the weighted sum into a mean.
        total_weight = np.zeros(1, np.float32)
        self.comm.Allreduce(np.array([self.mpi_rank_weight], dtype=np.float32), total_weight, op=MPI.SUM)
        total_weight = total_weight[0]

        buf = np.zeros(sum(sizes), np.float32)
        countholder = [0] # Counts how many times _collect_grads has been called
        stat = tf.reduce_sum(input_tensor=grads_and_vars[0][1]) # sum of first variable

        def _collect_grads(flat_grad, np_stat):
            # Runs as a py_func: clip, then average gradients across workers in numpy.
            if self.grad_clip is not None:
                gradnorm = np.linalg.norm(flat_grad)
                if gradnorm > 1:
                    flat_grad /= gradnorm
                logger.logkv_mean('gradnorm', gradnorm)
                logger.logkv_mean('gradclipfrac', float(gradnorm > 1))
            self.comm.Allreduce(flat_grad, buf, op=MPI.SUM)
            np.divide(buf, float(total_weight), out=buf)
            if countholder[0] % 100 == 0:
                # Periodically verify the workers' variables are still in sync.
                check_synced(np_stat, self.comm)
            countholder[0] += 1
            return buf

        avg_flat_grad = tf.compat.v1.py_func(_collect_grads, [flat_grad, stat], tf.float32)
        avg_flat_grad.set_shape(flat_grad.shape)
        # Unflatten the averaged vector back into per-variable gradients.
        avg_grads = tf.split(avg_flat_grad, sizes, axis=0)
        avg_grads_and_vars = [(tf.reshape(g, v.shape), v)
                              for g, (_, v) in zip(avg_grads, grads_and_vars)]
        return avg_grads_and_vars
52
+
53
def check_synced(localval, comm=None):
    """
    It's common to forget to initialize your variables to the same values, or
    (less commonly) if you update them in some other way than adam, to get them out of sync.
    This function checks that variables on all MPI workers are the same, and raises
    an AssertionError otherwise

    Arguments:
        comm: MPI communicator
        localval: list of local variables (list of variables on current worker to be compared with the other workers)
    """
    comm = comm or MPI.COMM_WORLD
    vals = comm.gather(localval)
    # Only the root receives the gathered list; other ranks have nothing to check.
    if comm.rank == 0:
        reference = vals[0]
        assert all(val == reference for val in vals[1:]),\
            'MpiAdamOptimizer detected that different workers have different weights: {}'.format(vals)
69
+
70
@with_mpi(timeout=5)
def test_nonfreeze():
    """Smoke test: MpiAdamOptimizer trains a toy loss under MPI without deadlocking."""
    np.random.seed(0)
    tf.compat.v1.set_random_seed(0)

    a = tf.Variable(np.random.randn(3).astype('float32'))
    b = tf.Variable(np.random.randn(2,5).astype('float32'))
    loss = tf.reduce_sum(input_tensor=tf.square(a)) + tf.reduce_sum(input_tensor=tf.sin(b))

    stepsize = 1e-2
    # for some reason the session config with inter_op_parallelism_threads was causing
    # nested sess.run calls to freeze
    config = tf.compat.v1.ConfigProto(inter_op_parallelism_threads=1)
    sess = U.get_session(config=config)
    update_op = MpiAdamOptimizer(comm=MPI.COMM_WORLD, learning_rate=stepsize).minimize(loss)
    sess.run(tf.compat.v1.global_variables_initializer())
    losslist_ref = []
    for i in range(100):
        # Each step triggers the py_func Allreduce; the loop hangs if MPI deadlocks.
        l,_ = sess.run([loss, update_op])
        print(i, l)
        losslist_ref.append(l)
baselines/common/mpi_fork.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, subprocess, sys
2
+
3
def mpi_fork(n, bind_to_core=False):
    """Re-launches the current script with workers
    Returns "parent" for original parent, "child" for MPI children
    """
    # n <= 1: nothing to fork; IN_MPI set: we ARE one of the relaunched children.
    if n <= 1 or os.getenv("IN_MPI") is not None:
        return "child"
    env = os.environ.copy()
    env.update(
        MKL_NUM_THREADS="1",
        OMP_NUM_THREADS="1",
        IN_MPI="1"
    )
    cmd = ["mpirun", "-np", str(n)]
    if bind_to_core:
        cmd += ["-bind-to", "core"]
    cmd += [sys.executable] + sys.argv
    subprocess.check_call(cmd, env=env)
    return "parent"
baselines/common/mpi_moments.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mpi4py import MPI
2
+ import numpy as np
3
+ from baselines.common import zipsame
4
+
5
+
6
def mpi_mean(x, axis=0, comm=None, keepdims=False):
    """Mean of `x` over `axis`, pooled across all MPI workers.

    Returns (pooled_mean, total_count)."""
    x = np.asarray(x)
    assert x.ndim > 0
    if comm is None:
        comm = MPI.COMM_WORLD
    local_sum = x.sum(axis=axis, keepdims=keepdims)
    n = local_sum.size
    # Pack [per-element sums, count] into one vector for a single allreduce.
    packed = np.zeros(n + 1, x.dtype)
    packed[:n] = local_sum.ravel()
    packed[n] = x.shape[axis]
    total = comm.allreduce(packed, op=MPI.SUM)
    return total[:n].reshape(local_sum.shape) / total[n], total[n]
19
+
20
def mpi_moments(x, axis=0, comm=None, keepdims=False):
    """Mean, std and pooled count of `x` over `axis` across MPI workers."""
    x = np.asarray(x)
    assert x.ndim > 0
    mean, count = mpi_mean(x, axis=axis, comm=comm, keepdims=True)
    sqdiffs = np.square(x - mean)
    meansqdiff, count1 = mpi_mean(sqdiffs, axis=axis, comm=comm, keepdims=True)
    assert count1 == count
    std = np.sqrt(meansqdiff)
    if not keepdims:
        # Drop the reduced axis that keepdims=True preserved above.
        squeezed = mean.shape[:axis] + mean.shape[axis+1:]
        mean = mean.reshape(squeezed)
        std = std.reshape(squeezed)
    return mean, std, count
33
+
34
+
35
def test_runningmeanstd():
    """Spawn three MPI workers that each run _helper_runningmeanstd."""
    import subprocess
    cmd = ['mpirun', '-np', '3',
           'python', '-c',
           'from baselines.common.mpi_moments import _helper_runningmeanstd; _helper_runningmeanstd()']
    subprocess.check_call(cmd)
40
+
41
def _helper_runningmeanstd():
    """Run on 3 MPI ranks: compare mpi_moments against direct numpy moments."""
    comm = MPI.COMM_WORLD
    np.random.seed(0)
    for (triple,axis) in [
        ((np.random.randn(3), np.random.randn(4), np.random.randn(5)),0),
        ((np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)),0),
        ((np.random.randn(2,3), np.random.randn(2,4), np.random.randn(2,4)),1),
        ]:

        # Ground truth: moments of the concatenation of all three shards.
        x = np.concatenate(triple, axis=axis)
        ms1 = [x.mean(axis=axis), x.std(axis=axis), x.shape[axis]]

        # Each rank contributes exactly one shard to the distributed computation.
        ms2 = mpi_moments(triple[comm.Get_rank()],axis=axis)

        for (a1,a2) in zipsame(ms1, ms2):
            print(a1, a2)
            assert np.allclose(a1, a2)
        print("ok!")
61
+
baselines/common/mpi_running_mean_std.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ try:
2
+ from mpi4py import MPI
3
+ except ImportError:
4
+ MPI = None
5
+
6
+ import tensorflow as tf, baselines.common.tf_util as U, numpy as np
7
+
8
class RunningMeanStd(object):
    """Running mean/std of a data stream, aggregated across MPI workers.

    Based on the parallel variance algorithm:
    https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm

    Running totals live in non-trainable float64 TF variables; `mean` and
    `std` are derived tensors.
    """
    def __init__(self, epsilon=1e-2, shape=()):
        # epsilon: initial value of the count and sum-of-squares variables,
        # which keeps early divisions away from zero.
        self._sum = tf.compat.v1.get_variable(
            dtype=tf.float64,
            shape=shape,
            initializer=tf.compat.v1.constant_initializer(0.0),
            name="runningsum", trainable=False)
        self._sumsq = tf.compat.v1.get_variable(
            dtype=tf.float64,
            shape=shape,
            initializer=tf.compat.v1.constant_initializer(epsilon),
            name="runningsumsq", trainable=False)
        self._count = tf.compat.v1.get_variable(
            dtype=tf.float64,
            shape=(),
            initializer=tf.compat.v1.constant_initializer(epsilon),
            name="count", trainable=False)
        self.shape = shape

        self.mean = tf.cast(self._sum / self._count, dtype=tf.float32)
        # Variance floored at 1e-2 so std never collapses to zero.
        self.std = tf.sqrt( tf.maximum( tf.cast(self._sumsq / self._count, dtype=tf.float32) - tf.square(self.mean) , 1e-2 ))

        newsum = tf.compat.v1.placeholder(shape=self.shape, dtype=tf.float64, name='sum')
        newsumsq = tf.compat.v1.placeholder(shape=self.shape, dtype=tf.float64, name='var')
        newcount = tf.compat.v1.placeholder(shape=[], dtype=tf.float64, name='count')
        # Callable that adds (sum, sumsq, count) increments to the variables.
        self.incfiltparams = U.function([newsum, newsumsq, newcount], [],
            updates=[tf.compat.v1.assign_add(self._sum, newsum),
                     tf.compat.v1.assign_add(self._sumsq, newsumsq),
                     tf.compat.v1.assign_add(self._count, newcount)])


    def update(self, x):
        """Fold a batch `x` (leading axis = batch) into the running statistics,
        summed over all MPI workers when mpi4py is available."""
        x = x.astype('float64')
        n = int(np.prod(self.shape))
        totalvec = np.zeros(n*2+1, 'float64')
        # Pack [sum, sum-of-squares, count] into one vector for a single Allreduce.
        addvec = np.concatenate([x.sum(axis=0).ravel(), np.square(x).sum(axis=0).ravel(), np.array([len(x)],dtype='float64')])
        if MPI is not None:
            MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=MPI.SUM)
        else:
            # Fix: previously totalvec stayed all-zero without mpi4py, making
            # update() a silent no-op. Single process: local stats ARE the total.
            totalvec = addvec
        self.incfiltparams(totalvec[0:n].reshape(self.shape), totalvec[n:2*n].reshape(self.shape), totalvec[2*n])
49
+
50
@U.in_session
def test_runningmeanstd():
    """Compare RunningMeanStd against direct numpy mean/std (single process)."""
    for (x1, x2, x3) in [
        (np.random.randn(3), np.random.randn(4), np.random.randn(5)),
        (np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)),
        ]:

        rms = RunningMeanStd(epsilon=0.0, shape=x1.shape[1:])
        U.initialize()

        # Ground truth from the concatenated data.
        x = np.concatenate([x1, x2, x3], axis=0)
        ms1 = [x.mean(axis=0), x.std(axis=0)]
        # Feed the same data in three incremental batches.
        rms.update(x1)
        rms.update(x2)
        rms.update(x3)
        ms2 = [rms.mean.eval(), rms.std.eval()]

        assert np.allclose(ms1, ms2)
68
+
69
@U.in_session
def test_dist():
    """Two-rank MPI test: distributed RunningMeanStd must match pooled numpy stats."""
    np.random.seed(0)
    p1,p2,p3=(np.random.randn(3,1), np.random.randn(4,1), np.random.randn(5,1))
    q1,q2,q3=(np.random.randn(6,1), np.random.randn(7,1), np.random.randn(8,1))

    # p1,p2,p3=(np.random.randn(3), np.random.randn(4), np.random.randn(5))
    # q1,q2,q3=(np.random.randn(6), np.random.randn(7), np.random.randn(8))

    comm = MPI.COMM_WORLD
    assert comm.Get_size()==2
    # Rank 0 gets the p shards, rank 1 the q shards.
    if comm.Get_rank()==0:
        x1,x2,x3 = p1,p2,p3
    elif comm.Get_rank()==1:
        x1,x2,x3 = q1,q2,q3
    else:
        assert False

    rms = RunningMeanStd(epsilon=0.0, shape=(1,))
    U.initialize()

    rms.update(x1)
    rms.update(x2)
    rms.update(x3)

    # Ground truth: all six shards pooled together.
    bigvec = np.concatenate([p1,p2,p3,q1,q2,q3])

    def checkallclose(x,y):
        print(x,y)
        return np.allclose(x,y)

    assert checkallclose(
        bigvec.mean(axis=0),
        rms.mean.eval(),
    )
    assert checkallclose(
        bigvec.std(axis=0),
        rms.std.eval(),
    )


if __name__ == "__main__":
    # Run with mpirun -np 2 python <filename>
    test_dist()
baselines/common/mpi_util.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import defaultdict
2
+ import os, numpy as np
3
+ import platform
4
+ import shutil
5
+ import subprocess
6
+ import warnings
7
+ import sys
8
+
9
+ try:
10
+ from mpi4py import MPI
11
+ except ImportError:
12
+ MPI = None
13
+
14
+
15
def sync_from_root(sess, variables, comm=None):
    """
    Send the root node's parameters to every worker.
    Arguments:
        sess: the TensorFlow session.
        variables: all parameter variables including optimizer's
    """
    if comm is None:
        comm = MPI.COMM_WORLD
    import tensorflow as tf
    # Root broadcasts its current variable values; everyone assigns them.
    broadcast_values = comm.bcast(sess.run(variables))
    assign_ops = [tf.compat.v1.assign(var, val)
                  for (var, val) in zip(variables, broadcast_values)]
    sess.run(assign_ops)
27
+
28
def gpu_count():
    """
    Count the GPUs on this machine.
    """
    # No nvidia-smi on PATH means no NVIDIA driver, hence no GPUs.
    if shutil.which('nvidia-smi') is None:
        return 0
    lines = subprocess.check_output(['nvidia-smi', '--query-gpu=gpu_name', '--format=csv']).split(b'\n')
    # Subtract the CSV header row and the trailing empty line.
    return max(0, len(lines) - 2)
36
+
37
def setup_mpi_gpus():
    """
    Set CUDA_VISIBLE_DEVICES to MPI rank if not already set
    """
    # Respect an explicit CUDA_VISIBLE_DEVICES from the caller/environment.
    if 'CUDA_VISIBLE_DEVICES' not in os.environ:
        if sys.platform == 'darwin': # This Assumes if you're on OSX you're just
            ids = []                 # doing a smoke test and don't want GPUs
        else:
            # One GPU per process on this machine, indexed by local rank.
            lrank, _lsize = get_local_rank_size(MPI.COMM_WORLD)
            ids = [lrank]
        os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, ids))
48
+
49
def get_local_rank_size(comm):
    """
    Returns (local_rank, local_size): the rank of this process among the
    processes running on the same machine, numbered 0..N-1, and N, the
    number of processes on this machine.

    Useful if you want to assign one gpu per machine.
    """
    hostname = platform.node()
    all_rank_nodes = comm.allgather((comm.Get_rank(), hostname))
    per_node_counter = defaultdict(int)
    my_local_rank = None
    # Walk ranks in gathered order; our local rank is how many processes on
    # our node were seen before us.
    for rank, node in all_rank_nodes:
        if rank == comm.Get_rank():
            my_local_rank = per_node_counter[node]
        per_node_counter[node] += 1
    assert my_local_rank is not None
    return my_local_rank, per_node_counter[hostname]
68
+
69
def share_file(comm, path):
    """
    Copies the file from rank 0 to all other ranks
    Puts it in the same place on all machines
    """
    localrank, _ = get_local_rank_size(comm)
    if comm.Get_rank() == 0:
        # Root reads the file and broadcasts its raw bytes.
        with open(path, 'rb') as fh:
            data = fh.read()
        comm.bcast(data)
    else:
        data = comm.bcast(None)
        # Only one rank per machine writes, avoiding concurrent writes to the same path.
        if localrank == 0:
            os.makedirs(os.path.dirname(path), exist_ok=True)
            with open(path, 'wb') as fh:
                fh.write(data)
    # Nobody proceeds until the file exists everywhere.
    comm.Barrier()
86
+
87
def dict_gather(comm, d, op='mean', assert_all_have_data=True):
    """
    Reduce a dict of values across MPI workers, key by key.
    op is 'mean' or 'sum'; with comm=None the dict is returned unchanged.
    """
    if comm is None:
        return d
    gathered = comm.allgather(d)
    world_size = comm.size
    # Collect every worker's value for each key.
    per_key = defaultdict(list)
    for worker_dict in gathered:
        for key, value in worker_dict.items():
            per_key[key].append(value)
    result = {}
    for key, values in per_key.items():
        if assert_all_have_data:
            assert len(values) == world_size, "only %i out of %i MPI workers have sent '%s'" % (len(values), world_size, key)
        if op == 'mean':
            result[key] = np.mean(values, axis=0)
        elif op == 'sum':
            result[key] = np.sum(values, axis=0)
        else:
            assert 0, op
    return result
109
+
110
def mpi_weighted_mean(comm, local_name2valcount):
    """
    Perform a weighted average over dicts that are each on a different node
    Input: local_name2valcount: dict mapping key -> (value, count)
    Returns: key -> mean on rank 0; {} on every other rank.
    """
    all_name2valcount = comm.gather(local_name2valcount)
    if comm.rank != 0:
        return {}
    sums = defaultdict(float)
    counts = defaultdict(float)
    for worker_dict in all_name2valcount:
        for name, (val, count) in worker_dict.items():
            try:
                numeric = float(val)
            except ValueError:
                # Non-numeric value: warn once (on root) and skip it.
                if comm.rank == 0:
                    warnings.warn('WARNING: tried to compute mean on non-float {}={}'.format(name, val))
            else:
                sums[name] += numeric * count
                counts[name] += count
    return {name: sums[name] / counts[name] for name in sums}
133
+
baselines/common/plot_util.py ADDED
@@ -0,0 +1,434 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import matplotlib.pyplot as plt
2
+ import os.path as osp
3
+ import json
4
+ import os
5
+ import numpy as np
6
+ import pandas
7
+ from collections import defaultdict, namedtuple
8
+ from baselines.bench import monitor
9
+ from baselines.logger import read_json, read_csv
10
+
11
def smooth(y, radius, mode='two_sided', valid_only=False):
    '''
    Smooth signal y, where radius determines the size of the window

    mode='two_sided':
        average over the window [max(index - radius, 0), min(index + radius, len(y)-1)]
    mode='causal':
        average over the window [max(index - radius, 0), index]

    valid_only: put nan in entries where the full-sized window is not available

    '''
    assert mode in ('two_sided', 'causal')
    if len(y) < 2*radius+1:
        # Not enough data for a full window: collapse to the signal mean.
        return np.ones_like(y) * y.mean()
    elif mode == 'two_sided':
        convkernel = np.ones(2 * radius+1)
        # Normalizing by a convolution of ones makes edge windows average over
        # only the samples that actually exist.
        out = np.convolve(y, convkernel, mode='same') / np.convolve(np.ones_like(y), convkernel, mode='same')
        if valid_only:
            out[:radius] = out[-radius:] = np.nan
    elif mode == 'causal':
        convkernel = np.ones(radius)
        out = np.convolve(y, convkernel, mode='full') / np.convolve(np.ones_like(y), convkernel, mode='full')
        # 'full' convolution yields len(y) + radius - 1 samples; keep the causal
        # prefix aligned with y. (The previous `out[:-radius+1]` produced an
        # empty array when radius == 1, since the slice became out[:0].)
        out = out[:len(y)]
        if valid_only:
            out[:radius] = np.nan
    return out
38
+
39
def one_sided_ema(xolds, yolds, low=None, high=None, n=512, decay_steps=1., low_counts_threshold=1e-8):
    '''
    perform one-sided (causal) EMA (exponential moving average)
    smoothing and resampling to an even grid with n points.
    Does not do extrapolation, so we assume
    xolds[0] <= low && high <= xolds[-1]

    Arguments:

    xolds: array or list - x values of data. Needs to be sorted in ascending order
    yolds: array of list - y values of data. Has to have the same length as xolds

    low: float - min value of the new x grid. By default equals to xolds[0]
    high: float - max value of the new x grid. By default equals to xolds[-1]

    n: int - number of points in new x grid

    decay_steps: float - EMA decay factor, expressed in new x grid steps.

    low_counts_threshold: float or int
        - y values with counts less than this value will be set to NaN

    Returns:
        tuple sum_ys, count_ys where
            xs - array with new x grid
            ys - array of EMA of y at each point of the new x grid
            count_ys - array of EMA of y counts at each point of the new x grid

    '''

    low = xolds[0] if low is None else low
    high = xolds[-1] if high is None else high

    assert xolds[0] <= low, 'low = {} < xolds[0] = {} - extrapolation not permitted!'.format(low, xolds[0])
    assert xolds[-1] >= high, 'high = {} > xolds[-1] = {} - extrapolation not permitted!'.format(high, xolds[-1])
    assert len(xolds) == len(yolds), 'length of xolds ({}) and yolds ({}) do not match!'.format(len(xolds), len(yolds))

    # Work in float64 so the exponential weights don't lose precision.
    xolds = xolds.astype('float64')
    yolds = yolds.astype('float64')

    luoi = 0  # last unused old index: next data point not yet folded into the EMA
    sum_y = 0.    # running exponentially-decayed sum of y values
    count_y = 0.  # running exponentially-decayed count (normalizer)
    xnews = np.linspace(low, high, n)
    # Decay time constant expressed in x units: one grid step times decay_steps.
    decay_period = (high - low) / (n - 1) * decay_steps
    # Multiplicative decay applied when advancing one grid step.
    interstep_decay = np.exp(- 1. / decay_steps)
    sum_ys = np.zeros_like(xnews)
    count_ys = np.zeros_like(xnews)
    for i in range(n):
        xnew = xnews[i]
        # Age the accumulated sum/count by one grid step.
        sum_y *= interstep_decay
        count_y *= interstep_decay
        # Fold in every old data point that falls at or before this grid point,
        # each decayed by its distance to xnew.
        while True:
            if luoi >= len(xolds):
                break
            xold = xolds[luoi]
            if xold <= xnew:
                decay = np.exp(- (xnew - xold) / decay_period)
                sum_y += decay * yolds[luoi]
                count_y += decay
                luoi += 1
            else:
                break
        sum_ys[i] = sum_y
        count_ys[i] = count_y

    ys = sum_ys / count_ys
    # Grid points supported by too little data are marked NaN rather than
    # reported as unreliable averages.
    ys[count_ys < low_counts_threshold] = np.nan

    return xnews, ys, count_ys
110
+
111
def symmetric_ema(xolds, yolds, low=None, high=None, n=512, decay_steps=1., low_counts_threshold=1e-8):
    '''
    Symmetric EMA smoothing with resampling onto an even n-point grid.

    Runs a causal EMA forward over the data and a second causal EMA backward
    (by negating and reversing the x axis), then combines the two passes with
    weights given by their respective counts. No extrapolation is performed,
    so xolds[0] <= low and high <= xolds[-1] must hold.

    Arguments and return value match one_sided_ema: returns
    (xs, ys, count_ys) where grid points whose combined count falls below
    low_counts_threshold are set to NaN.
    '''
    # Forward (causal) pass.
    xs, ys_fwd, counts_fwd = one_sided_ema(xolds, yolds, low, high, n, decay_steps, low_counts_threshold=0)
    # Backward pass: causal EMA on the mirrored axis, flipped back afterwards.
    _, ys_bwd, counts_bwd = one_sided_ema(-xolds[::-1], yolds[::-1], -high, -low, n, decay_steps, low_counts_threshold=0)
    ys_bwd = ys_bwd[::-1]
    counts_bwd = counts_bwd[::-1]
    total_counts = counts_fwd + counts_bwd
    # Count-weighted blend of the two passes.
    ys = (ys_fwd * counts_fwd + ys_bwd * counts_bwd) / total_counts
    ys[total_counts < low_counts_threshold] = np.nan
    return xs, ys, total_counts
148
+
149
# Container for a single run's loaded data; every field defaults to None so
# partially-populated results (e.g. monitor only) can be constructed.
Result = namedtuple('Result', 'monitor progress dirname metadata')
Result.__new__.__defaults__ = (None,) * len(Result._fields)
151
+
152
def load_results(root_dir_or_dirs, enable_progress=True, enable_monitor=True, verbose=False):
    '''
    load summaries of runs from a list of directories (including subdirectories)
    Arguments:

    root_dir_or_dirs: str or list of str - root directory (or list of them) to walk for run data

    enable_progress: bool - if True, will attempt to load data from progress.csv files (data saved by logger). Default: True

    enable_monitor: bool - if True, will attempt to load data from monitor.csv files (data saved by Monitor environment wrapper). Default: True

    verbose: bool - if True, will print out list of directories from which the data is loaded. Default: False


    Returns:
    List of Result objects with the following fields:
         - dirname - path to the directory data was loaded from
         - metadata - run metadata (such as command-line arguments and anything else in metadata.json file
         - monitor - if enable_monitor is True, this field contains pandas dataframe with loaded monitor.csv file (or aggregate of all *.monitor.csv files in the directory)
         - progress - if enable_progress is True, this field contains pandas dataframe with loaded progress.csv file
    '''
    import re
    if isinstance(root_dir_or_dirs, str):
        rootdirs = [osp.expanduser(root_dir_or_dirs)]
    else:
        rootdirs = [osp.expanduser(d) for d in root_dir_or_dirs]
    allresults = []
    for rootdir in rootdirs:
        assert osp.exists(rootdir), "%s doesn't exist"%rootdir
        for dirname, dirs, files in os.walk(rootdir):
            # Skip per-process scratch directories entirely.
            if '-proc' in dirname:
                files[:] = []
                continue
            monitor_re = re.compile(r'(\d+\.)?(\d+\.)?monitor\.csv')
            if set(['metadata.json', 'monitor.json', 'progress.json', 'progress.csv']).intersection(files) or \
               any([f for f in files if monitor_re.match(f)]): # also match monitor files like 0.1.monitor.csv
                # used to be uncommented, which means do not go deeper than current directory if any of the data files
                # are found
                # dirs[:] = []
                result = {'dirname' : dirname}
                if "metadata.json" in files:
                    with open(osp.join(dirname, "metadata.json"), "r") as fh:
                        result['metadata'] = json.load(fh)
                progjson = osp.join(dirname, "progress.json")
                progcsv = osp.join(dirname, "progress.csv")
                if enable_progress:
                    # Prefer the json log when both formats are present.
                    if osp.exists(progjson):
                        result['progress'] = pandas.DataFrame(read_json(progjson))
                    elif osp.exists(progcsv):
                        try:
                            result['progress'] = read_csv(progcsv)
                        except pandas.errors.EmptyDataError:
                            print('skipping progress file in ', dirname, 'empty data')
                    else:
                        if verbose: print('skipping %s: no progress file'%dirname)

                if enable_monitor:
                    try:
                        result['monitor'] = pandas.DataFrame(monitor.load_results(dirname))
                    except monitor.LoadMonitorResultsError:
                        print('skipping %s: no monitor files'%dirname)
                    except Exception as e:
                        # Best-effort loading: a corrupt monitor file should not
                        # abort the whole walk.
                        print('exception loading monitor file in %s: %s'%(dirname, e))

                # Only keep directories that yielded at least one data frame.
                if result.get('monitor') is not None or result.get('progress') is not None:
                    allresults.append(Result(**result))
                    if verbose:
                        print('successfully loaded %s'%dirname)

    if verbose: print('loaded %i results'%len(allresults))
    return allresults
221
+
222
# Palette of visually distinct matplotlib color names; plot_results cycles
# through these when assigning a color to each curve group.
COLORS = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'purple', 'pink',
        'brown', 'orange', 'teal',  'lightblue', 'lime', 'lavender', 'turquoise',
        'darkgreen', 'tan', 'salmon', 'gold',  'darkred', 'darkblue']
225
+
226
+
227
def default_xy_fn(r):
    """Default curve extractor: x is cumulative episode length (timesteps),
    y is the episode reward smoothed with a radius-10 window."""
    xs = np.cumsum(r.monitor.l)
    ys = smooth(r.monitor.r, radius=10)
    return xs, ys
231
+
232
def default_split_fn(r):
    """Default split key: the portion of r.dirname between the last slash and
    an optional trailing -<digits> seed suffix (None if nothing matches)."""
    import re
    # match name between slash and -<digits> at the end of the string
    # (slash in the beginning or -<digits> in the end or either may be missing)
    m = re.search(r'[^/-]+(?=(-\d+)?\Z)', r.dirname)
    return m.group(0) if m else None
239
+
240
def plot_results(
    allresults, *,
    xy_fn=default_xy_fn,
    split_fn=default_split_fn,
    group_fn=default_split_fn,
    average_group=False,
    shaded_std=True,
    shaded_err=True,
    figsize=None,
    legend_outside=False,
    resample=0,
    smooth_step=1.0,
    tiling='vertical',
    xlabel=None,
    ylabel=None
):
    '''
    Plot multiple Results objects

    xy_fn: function Result -> x,y           - function that converts results objects into tuple of x and y values.
                                              By default, x is cumsum of episode lengths, and y is episode rewards

    split_fn: function Result -> hashable   - function that converts results objects into keys to split curves into sub-panels by.
                                              That is, the results r for which split_fn(r) is different will be put on different sub-panels.
                                              By default, the portion of r.dirname between last / and -<digits> is returned. The sub-panels are
                                              stacked vertically in the figure.

    group_fn: function Result -> hashable   - function that converts results objects into keys to group curves by.
                                              That is, the results r for which group_fn(r) is the same will be put into the same group.
                                              Curves in the same group have the same color (if average_group is False), or averaged over
                                              (if average_group is True). The default value is the same as default value for split_fn

    average_group: bool                     - if True, will average the curves in the same group and plot the mean. Enables resampling
                                              (if resample = 0, will use 512 steps)

    shaded_std: bool                        - if True (default), the shaded region corresponding to standard deviation of the group of curves will be
                                              shown (only applicable if average_group = True)

    shaded_err: bool                        - if True (default), the shaded region corresponding to error in mean estimate of the group of curves
                                              (that is, standard deviation divided by square root of number of curves) will be
                                              shown (only applicable if average_group = True)

    figsize: tuple or None                  - size of the resulting figure (including sub-panels). By default, width is 6 and height is 6 times number of
                                              sub-panels.


    legend_outside: bool                    - if True, will place the legend outside of the sub-panels.

    resample: int                           - if not zero, size of the uniform grid in x direction to resample onto. Resampling is performed via symmetric
                                              EMA smoothing (see the docstring for symmetric_ema).
                                              Default is zero (no resampling). Note that if average_group is True, resampling is necessary; in that case, default
                                              value is 512.

    smooth_step: float                      - when resampling (i.e. when resample > 0 or average_group is True), use this EMA decay parameter (in units of the new grid step).
                                              See docstrings for decay_steps in symmetric_ema or one_sided_ema functions.

    tiling: str                             - 'vertical' (default), 'horizontal', or 'symmetric' layout of sub-panels

    xlabel, ylabel: str or None             - axis labels, applied to the bottom row / left column only

    Returns:
        (figure, 2-D array of Axes) as produced by plt.subplots
    '''

    if split_fn is None: split_fn = lambda _ : ''
    if group_fn is None: group_fn = lambda _ : ''
    sk2r = defaultdict(list) # splitkey2results
    for result in allresults:
        splitkey = split_fn(result)
        sk2r[splitkey].append(result)
    assert len(sk2r) > 0
    assert isinstance(resample, int), "0: don't resample. <integer>: that many samples"
    # Decide the sub-panel grid shape from the tiling mode.
    if tiling == 'vertical' or tiling is None:
        nrows = len(sk2r)
        ncols = 1
    elif tiling == 'horizontal':
        ncols = len(sk2r)
        nrows = 1
    elif tiling == 'symmetric':
        # Most-square grid: largest divisor of N not exceeding sqrt(N) gives ncols.
        import math
        N = len(sk2r)
        largest_divisor = 1
        for i in range(1, int(math.sqrt(N))+1):
            if N % i == 0:
                largest_divisor = i
        ncols = largest_divisor
        nrows = N // ncols
    figsize = figsize or (6 * ncols, 6 * nrows)

    f, axarr = plt.subplots(nrows, ncols, sharex=False, squeeze=False, figsize=figsize)

    groups = list(set(group_fn(result) for result in allresults))

    default_samples = 512
    if average_group:
        # Averaging requires a common x grid, so resampling is forced on.
        resample = resample or default_samples

    for (isplit, sk) in enumerate(sorted(sk2r.keys())):
        g2l = {}                    # group -> legend line handle
        g2c = defaultdict(int)      # group -> number of curves in the group
        sresults = sk2r[sk]
        gresults = defaultdict(list)
        idx_row = isplit // ncols
        idx_col = isplit % ncols
        ax = axarr[idx_row][idx_col]
        for result in sresults:
            group = group_fn(result)
            g2c[group] += 1
            x, y = xy_fn(result)
            if x is None: x = np.arange(len(y))
            x, y = map(np.asarray, (x, y))
            if average_group:
                # Defer plotting: curves are collected and averaged below.
                gresults[group].append((x,y))
            else:
                if resample:
                    x, y, counts = symmetric_ema(x, y, x[0], x[-1], resample, decay_steps=smooth_step)
                l, = ax.plot(x, y, color=COLORS[groups.index(group) % len(COLORS)])
                g2l[group] = l
        if average_group:
            for group in sorted(groups):
                xys = gresults[group]
                if not any(xys):
                    continue
                color = COLORS[groups.index(group) % len(COLORS)]
                origxs = [xy[0] for xy in xys]
                minxlen = min(map(len, origxs))
                def allequal(qs):
                    return all((q==qs[0]).all() for q in qs[1:])
                if resample:
                    # Resample every curve onto the overlap [max of starts, min of ends].
                    low = max(x[0] for x in origxs)
                    high = min(x[-1] for x in origxs)
                    usex = np.linspace(low, high, resample)
                    ys = []
                    for (x, y) in xys:
                        ys.append(symmetric_ema(x, y, low, high, resample, decay_steps=smooth_step)[1])
                else:
                    # Without resampling, curves must share an identical x grid.
                    assert allequal([x[:minxlen] for x in origxs]),\
                        'If you want to average unevenly sampled data, set resample=<number of samples you want>'
                    usex = origxs[0]
                    ys = [xy[1][:minxlen] for xy in xys]
                ymean = np.mean(ys, axis=0)
                ystd = np.std(ys, axis=0)
                ystderr = ystd / np.sqrt(len(ys))
                l, = axarr[idx_row][idx_col].plot(usex, ymean, color=color)
                g2l[group] = l
                if shaded_err:
                    ax.fill_between(usex, ymean - ystderr, ymean + ystderr, color=color, alpha=.4)
                if shaded_std:
                    ax.fill_between(usex, ymean - ystd, ymean + ystd, color=color, alpha=.2)


        # https://matplotlib.org/users/legend_guide.html
        plt.tight_layout()
        if any(g2l.keys()):
            ax.legend(
                g2l.values(),
                ['%s (%i)'%(g, g2c[g]) for g in g2l] if average_group else g2l.keys(),
                loc=2 if legend_outside else None,
                bbox_to_anchor=(1,1) if legend_outside else None)
        ax.set_title(sk)
    # add xlabels, but only to the bottom row
    if xlabel is not None:
        for ax in axarr[-1]:
            plt.sca(ax)
            plt.xlabel(xlabel)
    # add ylabels, but only to left column
    if ylabel is not None:
        for ax in axarr[:,0]:
            plt.sca(ax)
            plt.ylabel(ylabel)

    return f, axarr
406
+
407
def regression_analysis(df):
    """Fit an OLS regression of the 'score' column on all other columns of df
    and print the statsmodels fit summary."""
    import statsmodels.api as sm
    predictors = list(df.columns.copy())
    predictors.remove('score')
    model = sm.OLS(df[['score']], sm.add_constant(df[predictors]), hasconst=False)
    fit = model.fit()
    print(fit.summary())
415
+
416
def test_smooth():
    """Visual sanity check for symmetric_ema: smooth a noisy sine at three grid
    resolutions (upsampled, downsampled, same size) and show all curves."""
    norig = 100
    nup = 300
    ndown = 30
    # Irregularly spaced x values over roughly [0, 10].
    xs = np.cumsum(np.random.rand(norig) * 10 / norig)
    yclean = np.sin(xs)
    ys = yclean + .1 * np.random.randn(yclean.size)
    xup, yup, _ = symmetric_ema(xs, ys, xs.min(), xs.max(), nup, decay_steps=nup/ndown)
    xdown, ydown, _ = symmetric_ema(xs, ys, xs.min(), xs.max(), ndown, decay_steps=ndown/ndown)
    xsame, ysame, _ = symmetric_ema(xs, ys, xs.min(), xs.max(), norig, decay_steps=norig/ndown)
    plt.plot(xs, ys, label='orig', marker='x')
    plt.plot(xup, yup, label='up', marker='x')
    plt.plot(xdown, ydown, label='down', marker='x')
    plt.plot(xsame, ysame, label='same', marker='x')
    plt.plot(xs, yclean, label='clean', marker='x')
    plt.legend()
    plt.show()
433
+
434
+
baselines/common/policies.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tensorflow as tf
2
+ from baselines.common import tf_util
3
+ from baselines.a2c.utils import fc
4
+ from baselines.common.distributions import make_pdtype
5
+ from baselines.common.input import observation_placeholder, encode_observation
6
+ from baselines.common.tf_util import adjust_shape
7
+ from baselines.common.mpi_running_mean_std import RunningMeanStd
8
+ from baselines.common.models import get_network_builder
9
+
10
+ import gym
11
+
12
+
13
class PolicyWithValue(object):
    """
    Encapsulates fields and methods for RL policy and value function estimation with shared parameters
    """

    def __init__(self, env, observations, latent, estimate_q=False, vf_latent=None, sess=None, **tensors):
        """
        Parameters:
        ----------
        env             RL environment

        observations    tensorflow placeholder in which the observations will be fed

        latent          latent state from which policy distribution parameters should be inferred

        estimate_q      if True, build a per-action Q head instead of a scalar value head
                        (requires a discrete action space)

        vf_latent       latent state from which value function should be inferred (if None, then latent is used)

        sess            tensorflow session to run calculations in (if None, default session is used)

        **tensors       tensorflow tensors for additional attributes such as state or mask

        """

        self.X = observations
        # Empty-state sentinel for non-recurrent policies; step() maps it to None.
        self.state = tf.constant([])
        self.initial_state = None
        # Extra tensors (e.g. recurrent state/mask) become attributes so that
        # _evaluate can feed them by keyword name.
        self.__dict__.update(tensors)

        vf_latent = vf_latent if vf_latent is not None else latent

        vf_latent = tf.compat.v1.layers.flatten(vf_latent)
        latent = tf.compat.v1.layers.flatten(latent)

        # Based on the action space, will select what probability distribution type
        self.pdtype = make_pdtype(env.action_space)

        self.pd, self.pi = self.pdtype.pdfromlatent(latent, init_scale=0.01)

        # Take an action
        self.action = self.pd.sample()

        # Calculate the neg log of our probability
        self.neglogp = self.pd.neglogp(self.action)
        self.sess = sess or tf.compat.v1.get_default_session()

        if estimate_q:
            assert isinstance(env.action_space, gym.spaces.Discrete)
            self.q = fc(vf_latent, 'q', env.action_space.n)
            self.vf = self.q
        else:
            self.vf = fc(vf_latent, 'vf', 1)
            # Drop the trailing singleton dimension: vf is a per-batch scalar.
            self.vf = self.vf[:,0]

    def _evaluate(self, variables, observation, **extra_feed):
        # Run the requested graph nodes, feeding the observation plus any
        # extra placeholders (matched by attribute name) from extra_feed.
        sess = self.sess
        feed_dict = {self.X: adjust_shape(self.X, observation)}
        for inpt_name, data in extra_feed.items():
            if inpt_name in self.__dict__.keys():
                inpt = self.__dict__[inpt_name]
                # Only feed genuine placeholders; other tensors are computed.
                if isinstance(inpt, tf.Tensor) and inpt._op.type == 'Placeholder':
                    feed_dict[inpt] = adjust_shape(inpt, data)

        return sess.run(variables, feed_dict)

    def step(self, observation, **extra_feed):
        """
        Compute next action(s) given the observation(s)

        Parameters:
        ----------

        observation     observation data (either single or a batch)

        **extra_feed    additional data such as state or mask (names of the arguments should match the ones in constructor, see __init__)

        Returns:
        -------
        (action, value estimate, next state, negative log likelihood of the action under current policy parameters) tuple
        """

        a, v, state, neglogp = self._evaluate([self.action, self.vf, self.state, self.neglogp], observation, **extra_feed)
        # Non-recurrent policies carry the empty sentinel state; report None.
        if state.size == 0:
            state = None
        return a, v, state, neglogp

    def value(self, ob, *args, **kwargs):
        """
        Compute value estimate(s) given the observation(s)

        Parameters:
        ----------

        ob              observation data (either single or a batch)

        **kwargs        additional data such as state or mask (names of the arguments should match the ones in constructor, see __init__)

        Returns:
        -------
        value estimate
        """
        return self._evaluate(self.vf, ob, *args, **kwargs)

    def save(self, save_path):
        # Persist all session variables to save_path.
        tf_util.save_state(save_path, sess=self.sess)

    def load(self, load_path):
        # Restore session variables previously written by save().
        tf_util.load_state(load_path, sess=self.sess)
120
+
121
def build_policy(env, policy_network, value_network=None, normalize_observations=False, estimate_q=False, **policy_kwargs):
    """
    Build a policy constructor for the given environment.

    Parameters:
    ----------
    env                      RL environment (provides observation_space / action_space)
    policy_network           network builder callable, or the name of a registered network
                             (resolved via get_network_builder)
    value_network            None or 'shared' to share the policy latent; 'copy' to build a
                             separate network with the same architecture; or a callable builder
    normalize_observations   if True and observations are float32, normalize them with a
                             running mean/std and clip the result
    estimate_q               if True, the value head estimates per-action Q values
    **policy_kwargs          forwarded to the network builder when policy_network is a name

    Returns:
        policy_fn(nbatch, nsteps, sess, observ_placeholder) -> PolicyWithValue
    """
    if isinstance(policy_network, str):
        network_type = policy_network
        policy_network = get_network_builder(network_type)(**policy_kwargs)

    def policy_fn(nbatch=None, nsteps=None, sess=None, observ_placeholder=None):
        ob_space = env.observation_space

        # Reuse a caller-provided placeholder if given, otherwise create one.
        X = observ_placeholder if observ_placeholder is not None else observation_placeholder(ob_space, batch_size=nbatch)

        extra_tensors = {}

        if normalize_observations and X.dtype == tf.float32:
            encoded_x, rms = _normalize_clip_observation(X)
            extra_tensors['rms'] = rms
        else:
            encoded_x = X

        encoded_x = encode_observation(ob_space, encoded_x)

        with tf.compat.v1.variable_scope('pi', reuse=tf.compat.v1.AUTO_REUSE):
            policy_latent = policy_network(encoded_x)
            # A tuple return signals a recurrent network: (latent, recurrent_tensors).
            if isinstance(policy_latent, tuple):
                policy_latent, recurrent_tensors = policy_latent

                if recurrent_tensors is not None:
                    # recurrent architecture, need a few more steps
                    nenv = nbatch // nsteps
                    assert nenv > 0, 'Bad input for recurrent policy: batch size {} smaller than nsteps {}'.format(nbatch, nsteps)
                    # Rebuild with the environment count so state tensors get the right shape.
                    policy_latent, recurrent_tensors = policy_network(encoded_x, nenv)
                    extra_tensors.update(recurrent_tensors)


        _v_net = value_network

        if _v_net is None or _v_net == 'shared':
            vf_latent = policy_latent
        else:
            if _v_net == 'copy':
                _v_net = policy_network
            else:
                assert callable(_v_net)

            with tf.compat.v1.variable_scope('vf', reuse=tf.compat.v1.AUTO_REUSE):
                # TODO recurrent architectures are not supported with value_network=copy yet
                vf_latent = _v_net(encoded_x)

        policy = PolicyWithValue(
            env=env,
            observations=X,
            latent=policy_latent,
            vf_latent=vf_latent,
            sess=sess,
            estimate_q=estimate_q,
            **extra_tensors
        )
        return policy

    return policy_fn
180
+
181
+
182
def _normalize_clip_observation(x, clip_range=(-5.0, 5.0)):
    """
    Normalize observation tensor x with a running mean/std estimator and clip
    the result to clip_range.

    Returns (normalized_and_clipped_tensor, RunningMeanStd instance).
    """
    # Tuple default avoids the shared mutable-default-argument pitfall of the
    # original list literal; callers passing a list still work.
    rms = RunningMeanStd(shape=x.shape[1:])
    norm_x = tf.clip_by_value((x - rms.mean) / rms.std, min(clip_range), max(clip_range))
    return norm_x, rms
185
+ return norm_x, rms
186
+
baselines/common/retro_wrappers.py ADDED
@@ -0,0 +1,280 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import deque
2
+ import cv2
3
+ cv2.ocl.setUseOpenCL(False)
4
+ from .atari_wrappers import WarpFrame, ClipRewardEnv, FrameStack, ScaledFloatFrame
5
+ from .wrappers import TimeLimit
6
+ import numpy as np
7
+ import gym
8
+
9
+
10
class StochasticFrameSkip(gym.Wrapper):
    """Frame-skip wrapper with sticky actions: each chosen action is repeated
    for n substeps, and with probability stickprob the previous action persists
    for one extra substep before the new one takes effect."""
    def __init__(self, env, n, stickprob):
        gym.Wrapper.__init__(self, env)
        self.n = n                  # number of substeps per agent action
        self.stickprob = stickprob  # probability the old action "sticks" on substep 0
        self.curac = None           # action currently being executed
        self.rng = np.random.RandomState()
        # Some retro envs can skip rendering on intermediate substeps.
        self.supports_want_render = hasattr(env, "supports_want_render")

    def reset(self, **kwargs):
        self.curac = None
        return self.env.reset(**kwargs)

    def step(self, ac):
        done = False
        totrew = 0  # accumulate reward over all substeps
        for i in range(self.n):
            # First step after reset, use action
            if self.curac is None:
                self.curac = ac
            # First substep, delay with probability=stickprob
            elif i==0:
                if self.rng.rand() > self.stickprob:
                    self.curac = ac
            # Second substep, new action definitely kicks in
            elif i==1:
                self.curac = ac
            if self.supports_want_render and i<self.n-1:
                # Skip rendering on intermediate substeps when supported.
                ob, rew, done, info = self.env.step(self.curac, want_render=False)
            else:
                ob, rew, done, info = self.env.step(self.curac)
            totrew += rew
            if done: break
        return ob, totrew, done, info

    def seed(self, s):
        # Seed the sticky-action RNG (not the underlying environment).
        self.rng.seed(s)
47
+
48
class PartialFrameStack(gym.Wrapper):
    """Frame stack that keeps the full current frame but only one channel
    (the `channel` keyword) from each of the k-1 previous frames."""
    def __init__(self, env, k, channel=1):
        """
        Stack one channel (channel keyword) from previous frames
        """
        gym.Wrapper.__init__(self, env)
        shp = env.observation_space.shape
        self.channel = channel
        # Output depth: full current frame plus one channel per previous frame.
        self.observation_space = gym.spaces.Box(low=0, high=255,
            shape=(shp[0], shp[1], shp[2] + k - 1),
            dtype=env.observation_space.dtype)
        self.k = k
        self.frames = deque([], maxlen=k)
        shp = env.observation_space.shape

    def reset(self):
        ob = self.env.reset()
        assert ob.shape[2] > self.channel
        # Fill the history with copies of the initial frame.
        for _ in range(self.k):
            self.frames.append(ob)
        return self._get_ob()

    def step(self, ac):
        ob, reward, done, info = self.env.step(ac)
        self.frames.append(ob)
        return self._get_ob(), reward, done, info

    def _get_ob(self):
        assert len(self.frames) == self.k
        # Newest frame keeps all channels; older ones contribute one channel each.
        return np.concatenate([frame if i==self.k-1 else frame[:,:,self.channel:self.channel+1]
                               for (i, frame) in enumerate(self.frames)], axis=2)
78
+ for (i, frame) in enumerate(self.frames)], axis=2)
79
+
80
class Downsample(gym.ObservationWrapper):
    def __init__(self, env, ratio):
        """
        Downsample images by a factor of ratio
        """
        gym.ObservationWrapper.__init__(self, env)
        (oldh, oldw, oldc) = env.observation_space.shape
        newshape = (oldh//ratio, oldw//ratio, oldc)
        self.observation_space = gym.spaces.Box(low=0, high=255,
            shape=newshape, dtype=np.uint8)

    def observation(self, frame):
        height, width, _ = self.observation_space.shape
        # INTER_AREA is the recommended interpolation for shrinking images.
        frame = cv2.resize(frame, (width, height), interpolation=cv2.INTER_AREA)
        # cv2.resize drops the channel axis for single-channel images; restore it.
        if frame.ndim == 2:
            frame = frame[:,:,None]
        return frame
97
+
98
class Rgb2gray(gym.ObservationWrapper):
    def __init__(self, env):
        """
        Convert RGB frames to single-channel grayscale frames.
        """
        # NOTE(review): original docstring said "Downsample images by a factor
        # of ratio" — a copy-paste from Downsample; this wrapper only converts
        # color space, it does not resize.
        gym.ObservationWrapper.__init__(self, env)
        (oldh, oldw, _oldc) = env.observation_space.shape
        self.observation_space = gym.spaces.Box(low=0, high=255,
            shape=(oldh, oldw, 1), dtype=np.uint8)

    def observation(self, frame):
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        # Keep an explicit channel axis so downstream wrappers see HxWx1.
        return frame[:,:,None]
111
+
112
+
113
class MovieRecord(gym.Wrapper):
    """Record a movie of every k-th episode by pointing the unwrapped retro
    environment's movie_path at savedir (and disabling it otherwise)."""
    def __init__(self, env, savedir, k):
        gym.Wrapper.__init__(self, env)
        self.savedir = savedir
        self.k = k            # record one episode out of every k
        self.epcount = 0      # episodes started so far
    def reset(self):
        if self.epcount % self.k == 0:
            self.env.unwrapped.movie_path = self.savedir
        else:
            # Disable recording for this episode.
            self.env.unwrapped.movie_path = None
            self.env.unwrapped.movie = None
        self.epcount += 1
        return self.env.reset()
127
+
128
class AppendTimeout(gym.Wrapper):
    """Augment observations with 'value_estimation_timeout': the fraction of
    the episode's step budget still remaining, so a value function can account
    for time limits. Dict observation spaces get an extra key; other spaces
    are wrapped into {'original': ob, 'value_estimation_timeout': frac}."""
    def __init__(self, env):
        gym.Wrapper.__init__(self, env)
        self.action_space = env.action_space
        self.timeout_space = gym.spaces.Box(low=np.array([0.0]), high=np.array([1.0]), dtype=np.float32)
        self.original_os = env.observation_space
        if isinstance(self.original_os, gym.spaces.Dict):
            import copy
            ordered_dict = copy.deepcopy(self.original_os.spaces)
            ordered_dict['value_estimation_timeout'] = self.timeout_space
            self.observation_space = gym.spaces.Dict(ordered_dict)
            self.dict_mode = True
        else:
            self.observation_space = gym.spaces.Dict({
                'original': self.original_os,
                'value_estimation_timeout': self.timeout_space
            })
            self.dict_mode = False
        self.ac_count = None  # steps taken this episode; set by reset()
        # Walk down the wrapper chain to find the TimeLimit wrapper that
        # carries _max_episode_steps. NOTE(review): this loop raises
        # AttributeError (via env.env) if no such wrapper exists.
        while 1:
            if not hasattr(env, "_max_episode_steps"): # Looking for TimeLimit wrapper that has this field
                env = env.env
                continue
            break
        self.timeout = env._max_episode_steps

    def step(self, ac):
        self.ac_count += 1
        ob, rew, done, info = self.env.step(ac)
        return self._process(ob), rew, done, info

    def reset(self):
        self.ac_count = 0
        return self._process(self.env.reset())

    def _process(self, ob):
        # Fraction of the episode budget still remaining.
        fracmissing = 1 - self.ac_count / self.timeout
        if self.dict_mode:
            ob['value_estimation_timeout'] = fracmissing
            # BUG FIX: the original fell through here and returned None in
            # dict mode even though it had already annotated ob in place.
            return ob
        else:
            return { 'original': ob, 'value_estimation_timeout': fracmissing }
169
+
170
class StartDoingRandomActionsWrapper(gym.Wrapper):
    """
    Warning: can eat info dicts, not good if you depend on them

    Executes a random number (up to max_random_steps) of random actions at
    startup and/or after each episode, so the agent starts from varied states.
    reset() returns the observation reached by those random steps rather than
    calling env.reset() again.
    """
    def __init__(self, env, max_random_steps, on_startup=True, every_episode=False):
        gym.Wrapper.__init__(self, env)
        self.on_startup = on_startup
        self.every_episode = every_episode
        self.random_steps = max_random_steps
        self.last_obs = None  # most recent observation; served by reset()
        if on_startup:
            self.some_random_steps()

    def some_random_steps(self):
        # Reset, then take a random number of random actions; re-reset if an
        # episode terminates mid-burn-in.
        self.last_obs = self.env.reset()
        n = np.random.randint(self.random_steps)
        #print("running for random %i frames" % n)
        for _ in range(n):
            self.last_obs, _, done, _ = self.env.step(self.env.action_space.sample())
            if done: self.last_obs = self.env.reset()

    def reset(self):
        # Return the state reached by the random burn-in instead of resetting.
        return self.last_obs

    def step(self, a):
        self.last_obs, rew, done, info = self.env.step(a)
        if done:
            self.last_obs = self.env.reset()
            if self.every_episode:
                self.some_random_steps()
        return self.last_obs, rew, done, info
201
+
202
def make_retro(*, game, state=None, max_episode_steps=4500, **kwargs):
    """Create a gym-retro environment with stochastic frame skip and an
    optional step limit. Extra kwargs are forwarded to retro.make."""
    import retro
    if state is None:
        state = retro.State.DEFAULT
    env = retro.make(game, state, **kwargs)
    # Repeat each action 4 frames with 25% sticky-action probability.
    env = StochasticFrameSkip(env, n=4, stickprob=0.25)
    if max_episode_steps is not None:
        env = TimeLimit(env, max_episode_steps=max_episode_steps)
    return env
211
+
212
def wrap_deepmind_retro(env, scale=True, frame_stack=4):
    """
    Configure environment for retro games, using config similar to DeepMind-style Atari in wrap_deepmind
    """
    env = WarpFrame(env)          # resize + grayscale
    env = ClipRewardEnv(env)      # clip rewards to {-1, 0, +1}
    if frame_stack > 1:
        env = FrameStack(env, frame_stack)
    if scale:
        env = ScaledFloatFrame(env)  # convert uint8 pixels to floats in [0, 1]
    return env
223
+
224
class SonicDiscretizer(gym.ActionWrapper):
    """
    Wrap a gym-retro environment and make it use discrete
    actions for the Sonic game.
    """
    def __init__(self, env):
        super(SonicDiscretizer, self).__init__(env)
        # Genesis controller button layout expected by gym-retro.
        buttons = ["B", "A", "MODE", "START", "UP", "DOWN", "LEFT", "RIGHT", "C", "Y", "X", "Z"]
        # The 7 button combinations useful for playing Sonic.
        actions = [['LEFT'], ['RIGHT'], ['LEFT', 'DOWN'], ['RIGHT', 'DOWN'], ['DOWN'],
                   ['DOWN', 'B'], ['B']]
        self._actions = []
        # Precompute each combo as a 12-element boolean button mask.
        for action in actions:
            arr = np.array([False] * 12)
            for button in action:
                arr[buttons.index(button)] = True
            self._actions.append(arr)
        self.action_space = gym.spaces.Discrete(len(self._actions))

    def action(self, a): # pylint: disable=W0221
        # Copy so callers cannot mutate the cached mask.
        return self._actions[a].copy()
244
+
245
class RewardScaler(gym.RewardWrapper):
    """
    Multiply every reward by a constant factor to bring it into a range that
    works well for PPO. This matters a lot in practice: reward magnitude
    directly affects training performance.
    """
    def __init__(self, env, scale=0.01):
        super(RewardScaler, self).__init__(env)
        self.scale = scale

    def reward(self, reward):
        return self.scale * reward
257
+
258
class AllowBacktracking(gym.Wrapper):
    """
    Use deltas in max(X) as the reward, rather than deltas
    in X. This way, agents are not discouraged too heavily
    from exploring backwards if there is no way to advance
    head-on in the level.
    """
    def __init__(self, env):
        super(AllowBacktracking, self).__init__(env)
        self._cur_x = 0  # cumulative raw reward (proxy for x position)
        self._max_x = 0  # best cumulative value seen this episode

    def reset(self, **kwargs): # pylint: disable=E0202
        self._cur_x = 0
        self._max_x = 0
        return self.env.reset(**kwargs)

    def step(self, action): # pylint: disable=E0202
        obs, rew, done, info = self.env.step(action)
        self._cur_x += rew
        # Reward only new progress beyond the previous maximum; moving
        # backwards yields 0 rather than a penalty.
        rew = max(0, self._cur_x - self._max_x)
        self._max_x = max(self._max_x, self._cur_x)
        return obs, rew, done, info
baselines/common/runners.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from abc import ABC, abstractmethod
3
+
4
class AbstractEnvRunner(ABC):
    """
    Base class for experience-collection runners.

    Stores the environment, the model and the rollout length, and
    initializes the observation / state / done buffers that concrete
    subclasses advance in ``run``.
    """

    def __init__(self, *, env, model, nsteps):
        self.env = env
        self.model = model
        # Vectorized environments expose num_envs; fall back to a single env.
        nenv = env.num_envs if hasattr(env, 'num_envs') else 1
        self.nenv = nenv
        ob_space = env.observation_space
        self.batch_ob_shape = (nenv * nsteps,) + ob_space.shape
        # Observation buffer, seeded with the initial reset observation.
        self.obs = np.zeros((nenv,) + ob_space.shape, dtype=ob_space.dtype.name)
        self.obs[:] = env.reset()
        self.nsteps = nsteps
        self.states = model.initial_state
        self.dones = [False] * nenv

    @abstractmethod
    def run(self):
        """Collect one rollout; implemented by concrete subclasses."""
        raise NotImplementedError
19
+
baselines/common/running_mean_std.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tensorflow as tf
2
+ import numpy as np
3
+ from baselines.common.tf_util import get_session
4
+
5
class RunningMeanStd(object):
    # Streaming mean/variance via the parallel-variance merge rule:
    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
    def __init__(self, epsilon=1e-4, shape=()):
        # epsilon acts as a tiny prior sample count so the first update is stable.
        self.mean = np.zeros(shape, 'float64')
        self.var = np.ones(shape, 'float64')
        self.count = epsilon

    def update(self, x):
        """Fold a batch of samples (batch axis first) into the running moments."""
        self.update_from_moments(np.mean(x, axis=0), np.var(x, axis=0), x.shape[0])

    def update_from_moments(self, batch_mean, batch_var, batch_count):
        """Merge externally computed batch moments into the running moments."""
        self.mean, self.var, self.count = update_mean_var_count_from_moments(
            self.mean, self.var, self.count, batch_mean, batch_var, batch_count)

def update_mean_var_count_from_moments(mean, var, count, batch_mean, batch_var, batch_count):
    """Combine two (mean, var, count) moment triples into one.

    Implements Chan et al.'s parallel-variance formula and returns the
    merged ``(mean, var, count)`` triple.
    """
    delta = batch_mean - mean
    tot_count = count + batch_count

    new_mean = mean + delta * batch_count / tot_count
    # M2 accumulates the combined sum of squared deviations.
    m_a = var * count
    m_b = batch_var * batch_count
    M2 = m_a + m_b + np.square(delta) * count * batch_count / tot_count

    return new_mean, M2 / tot_count, tot_count
34
+
35
+
36
class TfRunningMeanStd(object):
    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
    '''
    TensorFlow variables-based implementation of computing running mean and std.

    Benefit of this implementation is that it can be saved / loaded
    together with the tensorflow model, since the statistics live in
    TF variables rather than plain numpy attributes.
    '''
    def __init__(self, epsilon=1e-4, shape=(), scope=''):
        sess = get_session()

        # Placeholders used to push freshly merged moments into the variables.
        self._new_mean = tf.compat.v1.placeholder(shape=shape, dtype=tf.float64)
        self._new_var = tf.compat.v1.placeholder(shape=shape, dtype=tf.float64)
        self._new_count = tf.compat.v1.placeholder(shape=(), dtype=tf.float64)


        with tf.compat.v1.variable_scope(scope, reuse=tf.compat.v1.AUTO_REUSE):
            # NOTE(review): the variance variable is named 'std' — presumably a
            # historical naming quirk kept for checkpoint compatibility; confirm
            # before renaming.
            self._mean = tf.compat.v1.get_variable('mean', initializer=np.zeros(shape, 'float64'), dtype=tf.float64)
            self._var = tf.compat.v1.get_variable('std', initializer=np.ones(shape, 'float64'), dtype=tf.float64)
            self._count = tf.compat.v1.get_variable('count', initializer=np.full((), epsilon, 'float64'), dtype=tf.float64)

        # Single grouped op that overwrites all three statistics at once.
        self.update_ops = tf.group([
            self._var.assign(self._new_var),
            self._mean.assign(self._new_mean),
            self._count.assign(self._new_count)
        ])

        sess.run(tf.compat.v1.variables_initializer([self._mean, self._var, self._count]))
        self.sess = sess
        self._set_mean_var_count()

    def _set_mean_var_count(self):
        # Cache the current variable values as plain numpy attributes
        # (mean / var / count) so reads don't require a session run.
        self.mean, self.var, self.count = self.sess.run([self._mean, self._var, self._count])

    def update(self, x):
        """Fold a batch of samples (batch axis first) into the running moments."""
        batch_mean = np.mean(x, axis=0)
        batch_var = np.var(x, axis=0)
        batch_count = x.shape[0]

        # Merge the moments on the host, then write the result back into TF.
        new_mean, new_var, new_count = update_mean_var_count_from_moments(self.mean, self.var, self.count, batch_mean, batch_var, batch_count)

        self.sess.run(self.update_ops, feed_dict={
            self._new_mean: new_mean,
            self._new_var: new_var,
            self._new_count: new_count
        })

        self._set_mean_var_count()
82
+
83
+
84
+
85
def test_runningmeanstd():
    """Check RunningMeanStd against full-batch moments on 1-D and 2-D data."""
    cases = [
        (np.random.randn(3), np.random.randn(4), np.random.randn(5)),
        (np.random.randn(3, 2), np.random.randn(4, 2), np.random.randn(5, 2)),
    ]
    for chunks in cases:
        rms = RunningMeanStd(epsilon=0.0, shape=chunks[0].shape[1:])

        full = np.concatenate(chunks, axis=0)
        expected = [full.mean(axis=0), full.var(axis=0)]
        for chunk in chunks:
            rms.update(chunk)
        actual = [rms.mean, rms.var]

        np.testing.assert_allclose(expected, actual)
101
+
102
def test_tf_runningmeanstd():
    """Check TfRunningMeanStd against full-batch moments on 1-D and 2-D data."""
    cases = [
        (np.random.randn(3), np.random.randn(4), np.random.randn(5)),
        (np.random.randn(3, 2), np.random.randn(4, 2), np.random.randn(5, 2)),
    ]
    for chunks in cases:
        # Randomized scope name so repeated runs do not collide on TF variables.
        scope = 'running_mean_std' + str(np.random.randint(0, 128))
        rms = TfRunningMeanStd(epsilon=0.0, shape=chunks[0].shape[1:], scope=scope)

        full = np.concatenate(chunks, axis=0)
        expected = [full.mean(axis=0), full.var(axis=0)]
        for chunk in chunks:
            rms.update(chunk)
        actual = [rms.mean, rms.var]

        np.testing.assert_allclose(expected, actual)
118
+
119
+
120
def profile_tf_runningmeanstd():
    """Micro-benchmark comparing RunningMeanStd and TfRunningMeanStd.

    Times ``update`` calls and ``mean`` attribute reads over ``n_trials``
    repetitions of each implementation and prints the wall-clock results.
    """
    import time
    from baselines.common import tf_util

    # Single-threaded session so timings are not skewed by TF parallelism.
    tf_util.get_session( config=tf.compat.v1.ConfigProto(
        inter_op_parallelism_threads=1,
        intra_op_parallelism_threads=1,
        allow_soft_placement=True
    ))

    x = np.random.random((376,))

    n_trials = 10000
    rms = RunningMeanStd()
    tfrms = TfRunningMeanStd()

    tic1 = time.time()
    for _ in range(n_trials):
        rms.update(x)

    tic2 = time.time()
    for _ in range(n_trials):
        tfrms.update(x)

    tic3 = time.time()

    print('rms update time ({} trials): {} s'.format(n_trials, tic2 - tic1))
    print('tfrms update time ({} trials): {} s'.format(n_trials, tic3 - tic2))


    tic1 = time.time()
    for _ in range(n_trials):
        z1 = rms.mean

    tic2 = time.time()
    for _ in range(n_trials):
        z2 = tfrms.mean

    # Sanity check: both implementations should agree after identical updates.
    assert z1 == z2

    tic3 = time.time()

    print('rms get mean time ({} trials): {} s'.format(n_trials, tic2 - tic1))
    print('tfrms get mean time ({} trials): {} s'.format(n_trials, tic3 - tic2))



    # Disabled chrome-trace profiling snippet, kept for manual use.
    '''
    options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) #pylint: disable=E1101
    run_metadata = tf.RunMetadata()
    profile_opts = dict(options=options, run_metadata=run_metadata)



    from tensorflow.python.client import timeline
    fetched_timeline = timeline.Timeline(run_metadata.step_stats) #pylint: disable=E1101
    chrome_trace = fetched_timeline.generate_chrome_trace_format()
    outfile = '/tmp/timeline.json'
    with open(outfile, 'wt') as f:
        f.write(chrome_trace)
    print('Successfully saved profile to {}. Exiting.'.format(outfile))
    exit(0)
    '''
183
+
184
+
185
+
186
if __name__ == '__main__':
    # Run the benchmark when this module is executed directly.
    profile_tf_runningmeanstd()
baselines/common/schedules.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """This file is used for specifying various schedules that evolve over
2
+ time throughout the execution of the algorithm, such as:
3
+ - learning rate for the optimizer
4
+ - exploration epsilon for the epsilon greedy exploration strategy
5
+ - beta parameter for beta parameter in prioritized replay
6
+
7
+ Each schedule has a function `value(t)` which returns the current value
8
+ of the parameter given the timestep t of the optimization procedure.
9
+ """
10
+
11
+
12
class Schedule(object):
    """Interface for time-dependent parameter schedules."""

    def value(self, t):
        """Return the schedule's value at timestep t."""
        raise NotImplementedError()
16
+
17
+
18
class ConstantSchedule(object):
    """Schedule that outputs the same value at every timestep."""

    def __init__(self, value):
        """Value remains constant over time.

        Parameters
        ----------
        value: float
            Constant value of the schedule
        """
        self._constant = value

    def value(self, t):
        """See Schedule.value"""
        return self._constant
32
+
33
+
34
def linear_interpolation(l, r, alpha):
    """Linearly interpolate between l and r by fraction alpha."""
    return l + alpha * (r - l)


class PiecewiseSchedule(object):
    def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None):
        """Piecewise schedule.

        endpoints: [(int, int)]
            list of pairs `(time, value)` meaning that the schedule should output
            `value` when `t==time`. All the values for time must be sorted in
            an increasing order. When t is between two times, e.g. `(time_a, value_a)`
            and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs
            `interpolation(value_a, value_b, alpha)` where alpha is the fraction of
            time passed between `time_a` and `time_b` for time `t`.
        interpolation: lambda float, float, float: float
            a function that takes value to the left and to the right of t according
            to the `endpoints`. Alpha is the fraction of distance from left endpoint to
            right endpoint that t has covered. See linear_interpolation for example.
        outside_value: float
            if the value is requested outside of all the intervals specified in
            `endpoints` this value is returned. If None then an AssertionError is
            raised when an outside value is requested.
        """
        times = [point[0] for point in endpoints]
        assert times == sorted(times)
        self._interpolation = interpolation
        self._outside_value = outside_value
        self._endpoints = endpoints

    def value(self, t):
        """See Schedule.value"""
        # Walk consecutive endpoint pairs; interpolate inside [l_t, r_t).
        for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]):
            if l_t <= t < r_t:
                alpha = float(t - l_t) / (r_t - l_t)
                return self._interpolation(l, r, alpha)

        # t lies outside every segment; fall back to the configured value.
        assert self._outside_value is not None
        return self._outside_value
74
+
75
+
76
class LinearSchedule(object):
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        """Linear interpolation between initial_p and final_p over
        schedule_timesteps. After this many timesteps pass final_p is
        returned.

        Parameters
        ----------
        schedule_timesteps: int
            Number of timesteps for which to linearly anneal initial_p
            to final_p
        initial_p: float
            initial output value
        final_p: float
            final output value
        """
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        """See Schedule.value"""
        # Clamp progress to [0, 1] so the schedule holds final_p afterwards.
        progress = min(float(t) / self.schedule_timesteps, 1.0)
        span = self.final_p - self.initial_p
        return self.initial_p + progress * span