# Retrained for 5,000,000 timesteps (commit ea6ff71)
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import VecNormalize
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import EvalCallback
# Create 20 parallel environments for training; VecNormalize normalizes
# observations (on by default) and rewards
env = make_vec_env('LunarLander-v2', n_envs=20)
env = VecNormalize(env, norm_reward=True)
# Create the evaluation environment and apply VecNormalize
eval_env = make_vec_env('LunarLander-v2', n_envs=1)
eval_env = VecNormalize(eval_env, norm_reward=True)
# Callback for evaluation and saving the best model
eval_callback = EvalCallback(eval_env, best_model_save_path='./logs/best_model',
                             log_path='./logs/', eval_freq=10_000,
                             deterministic=True, render=False)
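# Note: eval_freq counts callback calls, i.e. steps per environment, so with
# 20 parallel envs an evaluation runs every 10_000 * 20 = 200_000 total
# timesteps. EvalCallback also syncs the VecNormalize statistics from the
# training env to eval_env before each evaluation.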
# Initialize the PPO model with adjusted hyperparameters
model = PPO("MlpPolicy", env, verbose=1, device="cuda",
            batch_size=128,    # larger minibatch size
            n_steps=4096,      # rollout length per environment
            gae_lambda=0.98,
            gamma=0.999,
            n_epochs=4,
            ent_coef=0.01)     # entropy bonus to encourage exploration
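# With 20 envs and n_steps=4096, each PPO update collects 20 * 4096 = 81,920
# transitions, which are split into minibatches of 128 for n_epochs passes.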
# Train the model for 5 million timesteps
model.learn(total_timesteps=5_000_000, callback=eval_callback)
# Save the trained model
model_name = "ppo-LunarLander-v2"
model.save(model_name)
env.save("vecnormalize.pkl")
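# Sketch (not part of the original script): to resume training or run the
# agent later, reload both the policy weights and the saved VecNormalize
# statistics; loading the stats into a fresh env keeps observation scaling
# consistent with training.
reloaded_env = VecNormalize.load("vecnormalize.pkl", make_vec_env('LunarLander-v2', n_envs=1))
reloaded_model = PPO.load(model_name, env=reloaded_env)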
# Load the best model
best_model = PPO.load("./logs/best_model/best_model.zip")
# Switch the evaluation environment to inference mode: freeze the
# normalization statistics and report raw (unnormalized) rewards
eval_env.training = False
eval_env.norm_reward = False
# Evaluate the best model
mean_reward, std_reward = evaluate_policy(best_model, eval_env, n_eval_episodes=100, deterministic=True)
print(f"Best model mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")
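# Optional sketch: roll out a single episode with the best model as a sanity
# check; eval_env is vectorized, so observations, rewards, and dones come
# back as arrays.
obs = eval_env.reset()
done = False
episode_reward = 0.0
while not done:
    action, _states = best_model.predict(obs, deterministic=True)
    obs, rewards, dones, infos = eval_env.step(action)
    episode_reward += rewards[0]
    done = dones[0]
print(f"Demo episode reward: {episode_reward:.2f}")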