import tensorflow as tf import numpy as np from baselines.common.vec_env.dummy_vec_env import DummyVecEnv N_TRIALS = 10000 N_EPISODES = 100 _sess_config = tf.compat.v1.ConfigProto( allow_soft_placement=True, intra_op_parallelism_threads=1, inter_op_parallelism_threads=1 ) def simple_test(env_fn, learn_fn, min_reward_fraction, n_trials=N_TRIALS): def seeded_env_fn(): env = env_fn() env.seed(0) return env np.random.seed(0) env = DummyVecEnv([seeded_env_fn]) with tf.Graph().as_default(), tf.compat.v1.Session(config=_sess_config).as_default(): tf.compat.v1.set_random_seed(0) model = learn_fn(env) sum_rew = 0 done = True for i in range(n_trials): if done: obs = env.reset() state = model.initial_state if state is not None: a, v, state, _ = model.step(obs, S=state, M=[False]) else: a, v, _, _ = model.step(obs) obs, rew, done, _ = env.step(a) sum_rew += float(rew) print("Reward in {} trials is {}".format(n_trials, sum_rew)) assert sum_rew > min_reward_fraction * n_trials, \ 'sum of rewards {} is less than {} of the total number of trials {}'.format(sum_rew, min_reward_fraction, n_trials) def reward_per_episode_test(env_fn, learn_fn, min_avg_reward, n_trials=N_EPISODES): env = DummyVecEnv([env_fn]) with tf.Graph().as_default(), tf.compat.v1.Session(config=_sess_config).as_default(): model = learn_fn(env) N_TRIALS = 100 observations, actions, rewards = rollout(env, model, N_TRIALS) rewards = [sum(r) for r in rewards] avg_rew = sum(rewards) / N_TRIALS print("Average reward in {} episodes is {}".format(n_trials, avg_rew)) assert avg_rew > min_avg_reward, \ 'average reward in {} episodes ({}) is less than {}'.format(n_trials, avg_rew, min_avg_reward) def rollout(env, model, n_trials): rewards = [] actions = [] observations = [] for i in range(n_trials): obs = env.reset() state = model.initial_state if hasattr(model, 'initial_state') else None episode_rew = [] episode_actions = [] episode_obs = [] while True: if state is not None: a, v, state, _ = model.step(obs, S=state, M=[False]) else: a,v, _, _ = model.step(obs) obs, rew, done, _ = env.step(a) episode_rew.append(rew) episode_actions.append(a) episode_obs.append(obs) if done: break rewards.append(episode_rew) actions.append(episode_actions) observations.append(episode_obs) return observations, actions, rewards def smoketest(argstr, **kwargs): import tempfile import subprocess import os argstr = 'python -m baselines.run ' + argstr for key, value in kwargs: argstr += ' --{}={}'.format(key, value) tempdir = tempfile.mkdtemp() env = os.environ.copy() env['OPENAI_LOGDIR'] = tempdir subprocess.run(argstr.split(' '), env=env) return tempdir