# This file is used to configure logging and agent behaviour.
#
# The first part consists of Wandb info used to log experiments.
# Changing it adjusts the way logging is stored and displayed.
#
# The second part (config) is used to change hyperparameter settings of agents.
# Changing it adjusts the way agents behave and learn.
project: "Hopper-v5"
name: "PPO"
dir: "../logs"
notes: "Training Hopper-v5 using PPO"
mode: "online"
monitor_gym: false  # real boolean — the quoted string "False" is truthy in Python
config:
  # Environment, logging and saving control
  environment: "Hopper-v5"  # Environment to use
  algorithm: "PPO Continuous"  # What kind of algorithm to use?
  save_dir: "../models/"  # Where to save model?
  save_name: "ppo_hopper"  # Model name
  save_interval: 50  # How many previous episodes will be used to calculate mean reward?
  # NOTE(review): the key name suggests a save frequency, but the comment above
  # describes a reward-averaging window — confirm which the consumer expects.
  total_steps: 1000000  # For how many steps will the agent train?
  episode_steps: 1250  # How many steps before the episode is terminated?
  # Algorithm hyperparameters
  gamma: 0.999  # Discount factor for future rewards
  lambda: 0.99  # GAE tradeoff parameter
  ppo_epochs: 8  # How many epochs to train on each batch?
  rollout_length: 512  # How many steps to collect before updating?
  batch_size: 32  # How many steps are in each batch?
  clip_epsilon: 0.2  # How much is the policy clipped?
  learning_rate_actor: 0.0001  # Learning rate for actor head
  learning_rate_critic: 0.0001  # Learning rate for critic head
  value_loss_coef: 0.8  # How much is the value loss weighted?
  entropy_coef: 0.0025  # How much is the entropy loss weighted?
  max_grad_norm: 0.5  # Maximum norm for gradient clipping
  network_size: 256  # Number of neurons in each hidden layer