---
# High GPU utilization configuration
env_id: 'ALE/Pacman-v5'  # Gymnasium environment to train on
total_episodes: 100  # Total number of episodes to train for
num_envs: 256  # Number of parallel environments for vectorized training
num_steps: 128  # Number of steps to run per update
seed: 42  # Random seed for reproducibility
torch_deterministic: true  # Whether to use deterministic PyTorch operations
cuda: true  # Whether to use GPU acceleration
max_episode_steps: 1000  # Maximum steps per episode (truncates long episodes)
trajectory_save_every_n_frames: 4  # Save every Nth frame
save_trajectories: false  # Whether to save episode trajectories
save_every_n_updates: 1  # Save trajectories every N updates
n_envs_to_save: 12  # Number of parallel environments to save trajectories for
save_gradients: false  # Whether to save gradients

# agent.py
learning_rate: 1.0e-3  # Learning rate for the Adam optimizer
gamma: 0.99  # Discount factor for future rewards
gae_lambda: 0.95  # GAE (Generalized Advantage Estimation) lambda parameter
minibatch_size: 512  # Manual minibatch size
update_epochs: 8  # Number of epochs to update the policy per batch
clip_coef: 0.2  # PPO clipping coefficient (epsilon). Default 0.2
ent_coef: 0.01  # Entropy coefficient for exploration bonus
vf_coef: 0.5  # Value function loss coefficient
max_grad_norm: 0.5  # Maximum gradient norm for gradient clipping
target_kl: null  # Target KL divergence (null = no early stopping)
optimizer_eps: 1.0e-4  # Epsilon parameter for Adam optimizer
# List of hidden layer sizes for MLP (will override hidden_size if specified)
hidden_sizes: [256]
# CNN layer specs: one entry per layer (out_channels, kernel_size, stride)
cnn_layers:
  - { out_channels: 16, kernel_size: 8, stride: 4 }
  - { out_channels: 32, kernel_size: 4, stride: 2 }
  - { out_channels: 64, kernel_size: 3, stride: 2 }
# Standard deviation for layer weight initialization (sqrt(2))
layer_init_std: 1.4142135623730951
actor_std: 0.01  # Standard deviation for actor head initialization
critic_std: 1  # Standard deviation for critic head initialization

# logging
wandb_project_name: 'pacman-ppo'  # Weights & Biases project name for logging
wandb_entity: null  # Weights & Biases entity/username (null = default)

# video saving (not used)
capture_video: false  # Whether to capture videos of agent performance
save_video_freq: 5  # Save video every N updates
video_length: 1000  # Maximum number of frames to record per video

# huggingface
upload_to_hf: true  # Whether to upload to the Hugging Face repo named in hf_repo_id
hf_repo_id: 'jacobcd52/pacman_ppo'  # replace with your huggingface username and repo name