---
# This file is used to configure logging and agent behaviour.
#
# The first part consists of Wandb info used to log experiments.
# Changing it adjusts the way logging is stored and displayed.
#
# The second part (config) is used to change hyperparameter settings of agents.
# Changing it adjusts the way agents behave and learn.

project: "Hopper-v5"
name: "PPO"
dir: "../logs"
notes: "Training Hopper-v5 using PPO"
mode: "online"
# Real boolean instead of the string "False": a non-empty string is truthy to
# any consumer that tests the value directly.
monitor_gym: false

config:
  # Environment, logging and saving control
  environment: "Hopper-v5"         # Environment to use
  algorithm: "PPO Continuous"      # What kind of algorithm to use?
  save_dir: "../models/"           # Where to save model?
  save_name: "ppo_hopper"          # Model name
  # NOTE(review): the comment below describes a reward-averaging window, which
  # does not obviously match the key name `save_interval` - confirm against
  # the training code which of the two this value controls.
  save_interval: 50                # How many previous episodes will be used to calculate mean reward?
  # Written without digit separators: `1_000_000` is an integer only under
  # YAML 1.1 resolvers; YAML 1.2 core-schema loaders read it as a string.
  total_steps: 1000000             # For how many steps will the agent train?
  episode_steps: 1250              # How many steps before the episode is terminated?

  # Algorithm hyperparameters
  gamma: 0.999                     # Discount factor for future rewards
  lambda: 0.99                     # GAE tradeoff parameter
  ppo_epochs: 8                    # How many epochs to train on each batch?
  rollout_length: 512              # How many steps to collect before updating?
  batch_size: 32                   # How many steps are in each batch?
  clip_epsilon: 0.2                # How much is the policy clipped?
  learning_rate_actor: 0.0001      # Learning rate for actor head
  learning_rate_critic: 0.0001     # Learning rate for critic head
  value_loss_coef: 0.8             # How much is the value loss weighted?
  entropy_coef: 0.0025             # How much is the entropy loss weighted?
  max_grad_norm: 0.5               # Maximum norm for gradient clipping
  network_size: 256                # Number of neurons in each hidden layer