import numpy as np

from apple.envs.discrete_apple import get_apple_env

# Fixed seed so the randomized phase tests are reproducible across runs.
_SEED = 0


def _rollout(env, p, time_limit):
    """Run one random-policy episode in *env* and return its trace.

    Actions are drawn i.i.d. from {0, 1} with probabilities *p*; the
    episode stops at the first ``done`` signal or after *time_limit*
    steps, whichever comes first.

    Args:
        env: a gym-style env exposing ``reset()`` and ``step(action)``
            returning ``(obs, reward, done, info)``.
        p: length-2 probability vector over actions ``[0, 1]``.
        time_limit: maximum number of steps to take.

    Returns:
        ``(observations, actions, rewards, info)`` — the first three as
        numpy arrays over the executed steps, *info* from the last step.
    """
    observations, actions, rewards = [], [], []
    env.reset()
    info = {}
    for _ in range(time_limit):
        action = np.random.choice([0, 1], p=p)
        obs, reward, done, info = env.step(action)
        observations.append(obs)
        actions.append(action)
        rewards.append(reward)
        if done:
            break
    return np.array(observations), np.array(actions), np.array(rewards), info


def test_discrete_apple_phase1():
    """Phase 1: reward is +1 for action 1, -1 for action 0; +100 bonus on success.

    The observation is expected to stay constant at ``(1, -c)`` for the
    whole episode (presumably a phase indicator and the cost feature —
    derived from the assertions below; verify against the env).
    """
    np.random.seed(_SEED)  # deterministic action sequence -> reproducible test
    c = 0.5
    timelimit = 30
    env = get_apple_env("phase1", start_x=0, goal_x=10, c=c, time_limit=timelimit)
    observations, actions, rewards, info = _rollout(env, [0.2, 0.8], timelimit)
    # Maps action 1 -> +1 and action 0 -> -1 (float array via np.ones).
    target_rewards = np.ones(len(actions)) * actions * 2 - 1
    if info["success"]:
        # Terminal success overrides the last per-step reward.
        target_rewards[-1] = 100
    n = len(actions)
    target_states = np.stack([np.ones(n), np.ones(n) * -c], axis=1)
    assert (rewards == target_rewards).all()
    assert (observations == target_states).all()


def test_discrete_apple_phase2():
    """Phase 2: reward polarity is flipped (+1 for action 0, -1 for action 1).

    The observation is expected to stay constant at ``(1, +c)`` — note the
    sign of the cost feature flips relative to phase 1.
    """
    np.random.seed(_SEED)  # deterministic action sequence -> reproducible test
    c = 0.5
    timelimit = 30
    env = get_apple_env("phase2", start_x=0, goal_x=10, c=c, time_limit=timelimit)
    observations, actions, rewards, info = _rollout(env, [0.8, 0.2], timelimit)
    # Maps action 0 -> +1 and action 1 -> -1 (float array via np.ones).
    target_rewards = np.ones(len(actions)) * (1 - actions) * 2 - 1
    if info["success"]:
        # Terminal success overrides the last per-step reward.
        target_rewards[-1] = 100
    n = len(actions)
    target_states = np.stack([np.ones(n), np.ones(n) * c], axis=1)
    assert (rewards == target_rewards).all()
    assert (observations == target_states).all()


def test_discrete_apple_full():
    """Full task: 10 steps of action 1 then 10 of action 0 solves the env.

    Every step pays +1 (terminal step pays the +100 success bonus) and the
    cost feature flips sign from -c to +c halfway through the episode.
    """
    c = 0.5
    target_rewards = np.ones(20)
    target_rewards[-1] = 100  # terminal success bonus
    target_states = np.stack(
        [np.ones(20), np.concatenate([np.ones(10) * -c, np.ones(10) * c])],
        axis=1,
    )
    env = get_apple_env("full", start_x=0, goal_x=10, c=c, time_limit=30)
    observations, rewards = [], []
    env.reset()
    # Deterministic scripted policy: 10x action 1 followed by 10x action 0.
    for action in [1] * 10 + [0] * 10:
        obs, reward, done, info = env.step(action)
        observations.append(obs)
        rewards.append(reward)
    rewards = np.array(rewards)
    observations = np.array(observations)
    assert (rewards == target_rewards).all()
    assert (observations == target_states).all()