| import os |
|
|
| import gymnasium as gym |
| import numpy as np |
| import torch |
| import torch.nn as nn |
| import torch.optim as optim |
| from torch.distributions.categorical import Categorical |
|
|
| |
# --- Run configuration -------------------------------------------------------
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Environment and overall training budget.
env_id = "LunarLander-v2"
total_timesteps = 500_000

# Optimizer step size.
learning_rate = 2.5e-4

# Rollout geometry: each iteration collects num_steps transitions from each of
# num_envs environments, so one batch holds num_envs * num_steps transitions.
num_envs = 4
num_steps = 128
batch_size = num_envs * num_steps

# Update-phase hyperparameters.
minibatch_size = 32
update_epochs = 4
clip_coef = 0.2
ent_coef = 0.01
|
|
|
|
| |
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Initialise ``layer`` in place and hand it back.

    The weight matrix is filled with an orthogonal matrix scaled by ``std``
    and every bias entry is set to ``bias_const``.

    Args:
        layer: Module exposing ``weight`` and ``bias`` tensors
            (e.g. ``nn.Linear``).
        std: Gain applied to the orthogonal weight matrix.
        bias_const: Value written to every bias element.

    Returns:
        The same ``layer`` object, so the call can be nested inside
        ``nn.Sequential(...)``.
    """
    nn.init.orthogonal_(layer.weight, gain=std)
    nn.init.constant_(layer.bias, val=bias_const)
    return layer
|
|
|
|
class Agent(nn.Module):
    """Actor-critic model with independent policy and value networks.

    Both networks are MLPs with two 64-unit tanh hidden layers that take the
    flattened observation as input. The critic outputs a single value; the
    actor outputs one logit per discrete action.
    """

    def __init__(self, envs):
        """Build the critic and actor from the vector env's space metadata.

        Args:
            envs: Object exposing ``single_observation_space.shape`` and
                ``single_action_space.n``.
        """
        super().__init__()
        obs_dim = np.array(envs.single_observation_space.shape).prod()
        n_actions = envs.single_action_space.n
        # The critic is built before the actor so parameters are created
        # (and random numbers consumed) in the same order as before.
        self.critic = self._mlp(obs_dim, 1, out_std=1.0)
        self.actor = self._mlp(obs_dim, n_actions, out_std=0.01)

    @staticmethod
    def _mlp(in_features, out_features, out_std):
        """Return a 64-64 tanh MLP whose output layer uses gain ``out_std``."""
        hidden = 64
        first = layer_init(nn.Linear(in_features, hidden))
        second = layer_init(nn.Linear(hidden, hidden))
        head = layer_init(nn.Linear(hidden, out_features), std=out_std)
        return nn.Sequential(first, nn.Tanh(), second, nn.Tanh(), head)

    def get_value(self, x):
        """Return the critic's output for observations ``x``."""
        value = self.critic(x)
        return value

    def get_action_and_value(self, x, action=None):
        """Evaluate the policy and the critic on observations ``x``.

        Args:
            x: Observations fed to both networks.
            action: Actions to score. When ``None``, actions are sampled
                from the categorical policy instead.

        Returns:
            Tuple ``(action, log_prob, entropy, value)`` where ``log_prob``
            and ``entropy`` come from the categorical distribution defined
            by the actor's logits and ``value`` is the critic's output.
        """
        policy = Categorical(logits=self.actor(x))
        chosen = policy.sample() if action is None else action
        return chosen, policy.log_prob(chosen), policy.entropy(), self.critic(x)
|
|
|
|
| |
if __name__ == "__main__":
    # Script entry point: collect rollouts from a synchronous vector env,
    # save the agent's weights, then hand the working directory to the
    # upload helper when one is available.
    envs = gym.vector.SyncVectorEnv(
        [lambda: gym.make(env_id) for _ in range(num_envs)]
    )
    try:
        agent = Agent(envs).to(device)
        # NOTE(review): the optimizer is created but never stepped anywhere in
        # this block, so the weights saved below are the initial ones. The
        # update phase (and the minibatch_size / update_epochs / clip_coef /
        # ent_coef settings) appears to have been elided -- confirm.
        optimizer = optim.Adam(agent.parameters(), lr=learning_rate, eps=1e-5)

        # Rollout storage, indexed as [step, env, ...]; allocated directly on
        # the target device instead of being built on CPU and copied over.
        rollout_shape = (num_steps, num_envs)
        obs = torch.zeros(
            rollout_shape + envs.single_observation_space.shape, device=device
        )
        actions = torch.zeros(
            rollout_shape + envs.single_action_space.shape, device=device
        )
        logprobs = torch.zeros(rollout_shape, device=device)
        rewards = torch.zeros(rollout_shape, device=device)
        dones = torch.zeros(rollout_shape, device=device)
        values = torch.zeros(rollout_shape, device=device)

        global_step = 0
        next_obs, _ = envs.reset()
        next_obs = torch.Tensor(next_obs).to(device)
        next_done = torch.zeros(num_envs, device=device)

        for iteration in range(1, total_timesteps // batch_size + 1):
            for step in range(num_steps):
                # Every vector step advances all num_envs environments.
                global_step += num_envs
                obs[step] = next_obs
                dones[step] = next_done

                # Acting needs no gradients; only the recorded values are kept.
                with torch.no_grad():
                    action, logprob, _, value = agent.get_action_and_value(
                        next_obs
                    )
                    values[step] = value.flatten()
                actions[step] = action
                logprobs[step] = logprob

                # Gymnasium returns separate termination and truncation flags;
                # either one marks the end of an episode.
                next_obs, reward, terminations, truncations, infos = envs.step(
                    action.cpu().numpy()
                )
                next_done = np.logical_or(terminations, truncations)
                rewards[step] = torch.tensor(reward).to(device).view(-1)
                next_obs = torch.Tensor(next_obs).to(device)
                next_done = torch.Tensor(next_done).to(device)

            print(f"Step: {global_step} | Training...")
    finally:
        # Release the sub-environments even if the rollout raises.
        envs.close()

    model_name = "ppo-LunarLander-v2"
    torch.save(agent.state_dict(), f"{model_name}.pt")

    repo_id = f"your-username/{model_name}"
    # push_to_hub_fast is neither imported nor defined in this file, so it is
    # looked up dynamically: a missing helper skips the upload instead of
    # raising NameError after the model has already been saved.
    uploader = globals().get("push_to_hub_fast")
    if uploader is None:
        print(f"push_to_hub_fast is not defined; skipping upload to {repo_id}.")
    else:
        # NOTE(review): folder_path="./" presumably uploads the whole working
        # directory -- confirm nothing sensitive lives there before running.
        uploader(repo_id=repo_id, folder_path="./", token=os.getenv("HF_TOKEN"))
|
|