File size: 3,221 Bytes
5960497
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import tensorflow as tf
import numpy as np
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

N_TRIALS = 10000
N_EPISODES = 100

_sess_config = tf.compat.v1.ConfigProto(
    allow_soft_placement=True,
    intra_op_parallelism_threads=1,
    inter_op_parallelism_threads=1
)

def simple_test(env_fn, learn_fn, min_reward_fraction, n_trials=N_TRIALS):
    def seeded_env_fn():
        env = env_fn()
        env.seed(0)
        return env

    np.random.seed(0)
    env = DummyVecEnv([seeded_env_fn])
    with tf.Graph().as_default(), tf.compat.v1.Session(config=_sess_config).as_default():
        tf.compat.v1.set_random_seed(0)
        model = learn_fn(env)
        sum_rew = 0
        done = True
        for i in range(n_trials):
            if done:
                obs = env.reset()
                state = model.initial_state
            if state is not None:
                a, v, state, _ = model.step(obs, S=state, M=[False])
            else:
                a, v, _, _ = model.step(obs)
            obs, rew, done, _ = env.step(a)
            sum_rew += float(rew)
        print("Reward in {} trials is {}".format(n_trials, sum_rew))
        assert sum_rew > min_reward_fraction * n_trials, \
            'sum of rewards {} is less than {} of the total number of trials {}'.format(sum_rew, min_reward_fraction, n_trials)

def reward_per_episode_test(env_fn, learn_fn, min_avg_reward, n_trials=N_EPISODES):
    env = DummyVecEnv([env_fn])
    with tf.Graph().as_default(), tf.compat.v1.Session(config=_sess_config).as_default():
        model = learn_fn(env)
        N_TRIALS = 100
        observations, actions, rewards = rollout(env, model, N_TRIALS)
        rewards = [sum(r) for r in rewards]
        avg_rew = sum(rewards) / N_TRIALS
        print("Average reward in {} episodes is {}".format(n_trials, avg_rew))
        assert avg_rew > min_avg_reward, \
            'average reward in {} episodes ({}) is less than {}'.format(n_trials, avg_rew, min_avg_reward)

def rollout(env, model, n_trials):
    rewards = []
    actions = []
    observations = []
    for i in range(n_trials):
        obs = env.reset()
        state = model.initial_state if hasattr(model, 'initial_state') else None
        episode_rew = []
        episode_actions = []
        episode_obs = []
        while True:
            if state is not None:
                a, v, state, _ = model.step(obs, S=state, M=[False])
            else:
                a,v, _, _ = model.step(obs)

            obs, rew, done, _ = env.step(a)
            episode_rew.append(rew)
            episode_actions.append(a)
            episode_obs.append(obs)
            if done:
                break
        rewards.append(episode_rew)
        actions.append(episode_actions)
        observations.append(episode_obs)
    return observations, actions, rewards


def smoketest(argstr, **kwargs):
    import tempfile
    import subprocess
    import os
    argstr = 'python -m baselines.run ' + argstr
    for key, value in kwargs:
        argstr += ' --{}={}'.format(key, value)
    tempdir = tempfile.mkdtemp()
    env = os.environ.copy()
    env['OPENAI_LOGDIR'] = tempdir
    subprocess.run(argstr.split(' '), env=env)
    return tempdir