from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import VecNormalize

# Create 20 parallel environments for training and normalize the rewards
env = make_vec_env('LunarLander-v2', n_envs=20)
env = VecNormalize(env, norm_reward=True)

# Create the evaluation environment; freeze its statistics and keep rewards
# unnormalized so evaluation scores are comparable to the raw environment reward
eval_env = make_vec_env('LunarLander-v2', n_envs=1)
eval_env = VecNormalize(eval_env, training=False, norm_reward=False)

# Callback that periodically evaluates the agent and saves the best model
eval_callback = EvalCallback(eval_env,
                             best_model_save_path='./logs/best_model',
                             log_path='./logs/',
                             eval_freq=10_000,
                             deterministic=True,
                             render=False)

# Initialize the PPO model with adjusted hyperparameters
model = PPO("MlpPolicy",
            env,
            verbose=1,
            device="cuda",
            batch_size=128,   # Larger batch size
            n_steps=4096,
            gae_lambda=0.98,
            gamma=0.999,
            n_epochs=4,
            ent_coef=0.01)

# Train the model for 5 million timesteps
model.learn(total_timesteps=5_000_000, callback=eval_callback)

# Save the trained model and the normalization statistics
model_name = "ppo-LunarLander-v2"
model.save(model_name)
env.save("vecnormalize.pkl")

# Load the best model found during training
best_model = PPO.load("./logs/best_model/best_model.zip")

# eval_env was created with training=False and norm_reward=False, and its
# observation statistics were kept in sync with the training environment by
# EvalCallback, so it can be used directly for the final evaluation
mean_reward, std_reward = evaluate_policy(best_model, eval_env,
                                          n_eval_episodes=100,
                                          deterministic=True)
print(f"Best model mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")
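
# --- Reloading for evaluation in a fresh session ---
# The script above saves "vecnormalize.pkl" but never reloads it. If you
# evaluate in a new process (or deploy the agent), the saved normalization
# statistics must be restored alongside the model. A minimal sketch, assuming
# the files saved above ("ppo-LunarLander-v2.zip" and "vecnormalize.pkl")
# are present in the working directory:
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import VecNormalize

# Rebuild the evaluation environment and restore the saved statistics
eval_env = make_vec_env('LunarLander-v2', n_envs=1)
eval_env = VecNormalize.load("vecnormalize.pkl", eval_env)
eval_env.training = False      # Freeze the running statistics
eval_env.norm_reward = False   # Report raw (unnormalized) rewards

model = PPO.load("ppo-LunarLander-v2", env=eval_env)
mean_reward, std_reward = evaluate_policy(model, eval_env,
                                          n_eval_episodes=100,
                                          deterministic=True)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")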