|
|
from stable_baselines3.common.env_util import make_vec_env |
|
|
from stable_baselines3.common.evaluation import evaluate_policy |
|
|
from stable_baselines3.common.vec_env import VecNormalize |
|
|
from stable_baselines3 import PPO |
|
|
from stable_baselines3.common.callbacks import EvalCallback |
|
|
|
|
|
|
|
|
|
|
|
# 20 parallel training environments; normalizing rewards stabilizes PPO
# value-function learning on LunarLander's wide reward range.
env = make_vec_env('LunarLander-v2', n_envs=20)
env = VecNormalize(env, norm_reward=True)


# Separate single-env copy used by EvalCallback for periodic evaluation.
# Per the SB3 docs, the eval wrapper must NOT update running statistics
# (training=False) and must report raw, unnormalized episode rewards
# (norm_reward=False) so the logged eval scores are comparable to the
# environment's true reward scale. EvalCallback synchronizes the
# observation-normalization statistics from the training env before each
# evaluation, so this wrapper never needs to learn its own.
eval_env = make_vec_env('LunarLander-v2', n_envs=1)
eval_env = VecNormalize(eval_env, training=False, norm_reward=False)
|
|
|
|
|
|
|
|
# Evaluate the agent every eval_freq steps on eval_env, keeping the best
# checkpoint on disk and logging evaluation results.
best_model_dir = './logs/best_model'
eval_log_dir = './logs/'
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path=best_model_dir,
    log_path=eval_log_dir,
    eval_freq=10_000,
    deterministic=True,
    render=False,
)
|
|
|
|
|
|
|
|
# PPO hyperparameters for LunarLander-v2. Collected in a dict so the
# tuning knobs read as one unit; values are unchanged.
ppo_hyperparams = dict(
    batch_size=128,    # minibatch size per gradient step
    n_steps=4096,      # rollout length per environment before each update
    gae_lambda=0.98,   # GAE bias/variance trade-off
    gamma=0.999,       # long-horizon discount (episodes are long)
    n_epochs=4,        # optimization passes over each rollout
    ent_coef=0.01,     # entropy bonus to keep exploration alive
)
model = PPO("MlpPolicy", env, verbose=1, device="cuda", **ppo_hyperparams)
|
|
|
|
|
|
|
|
|
|
|
# Train for 5M steps; the callback evaluates periodically and snapshots
# the best-performing policy.
TOTAL_TIMESTEPS = 5_000_000
model.learn(total_timesteps=TOTAL_TIMESTEPS, callback=eval_callback)


# Persist both the final policy and the VecNormalize statistics — the
# saved stats are required to reproduce the agent's behavior later,
# since the policy was trained on normalized observations.
model_name = "ppo-LunarLander-v2"
model.save(model_name)
env.save("vecnormalize.pkl")
|
|
|
|
|
|
|
|
# Load the best checkpoint saved by EvalCallback during training.
best_model = PPO.load("./logs/best_model/best_model.zip")


# Rebuild the evaluation env from the normalization statistics saved at
# the end of training, so observations are scaled exactly as the policy
# saw them — reusing the callback's eval_env would rely on its stats
# happening to match the training env's final statistics.
eval_env = VecNormalize.load("vecnormalize.pkl", make_vec_env('LunarLander-v2', n_envs=1))

# Freeze the running statistics and report raw (unnormalized) rewards so
# the evaluation score is on the environment's true reward scale.
eval_env.training = False
eval_env.norm_reward = False


# 100-episode deterministic evaluation of the best policy.
mean_reward, std_reward = evaluate_policy(best_model, eval_env, n_eval_episodes=100, deterministic=True)


print(f"Best model mean_reward={mean_reward:.2f} +/- {std_reward}")
|
|
|