# File size: 4,250 Bytes (listing id 12d0891)
import os
import gymnasium as gym
import numpy as np # Ensure numpy is imported
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical
# --- Hyperparameters ---
# Environment and rollout sizing.
# NOTE(review): newer Gymnasium releases may register this environment under
# a different version suffix - confirm against the installed version.
env_id = "LunarLander-v2"
num_envs = 4
num_steps = 128
total_timesteps = 500000
# One rollout batch holds every step collected from every environment.
batch_size = num_steps * num_envs

# Optimizer step size.
learning_rate = 2.5e-4

# Update-phase settings (names follow common PPO usage - confirm against the
# update code that consumes them).
update_epochs = 4
minibatch_size = 32
clip_coef = 0.2
ent_coef = 0.01

# Run on the GPU when PyTorch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# --- Model Architecture ---
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Initialise ``layer`` in place and return it.

    The weight is filled with an orthogonal matrix scaled by ``std``; the
    bias, when the layer has one, is set to the constant ``bias_const``.

    Args:
        layer: Module exposing a ``weight`` tensor and, optionally, a ``bias``
            tensor (for example ``nn.Linear``).
        std: Gain passed to the orthogonal initialiser.
        bias_const: Value written to every bias element.

    Returns:
        The same ``layer`` object, so the call can be nested inside a
        container such as ``nn.Sequential``.
    """
    torch.nn.init.orthogonal_(layer.weight, gain=std)
    # Layers built with ``bias=False`` carry ``bias = None``; there is nothing
    # to initialise in that case, so skip it instead of crashing.
    bias = getattr(layer, "bias", None)
    if bias is not None:
        torch.nn.init.constant_(bias, bias_const)
    return layer
class Agent(nn.Module):
    """Actor-critic network made of two separate three-layer Tanh MLPs.

    ``critic`` maps an observation to a single value and ``actor`` maps it to
    one logit per discrete action. Both use 64 hidden units per layer.
    """

    def __init__(self, envs):
        """Build both networks from the spaces exposed by ``envs``.

        Args:
            envs: Object providing ``single_observation_space.shape`` and
                ``single_action_space.n``.
        """
        super().__init__()
        obs_dim = np.array(envs.single_observation_space.shape).prod()
        # The critic is created before the actor so parameters are
        # initialised in the same order as the attribute names suggest.
        self.critic = self._build_mlp(obs_dim, 1, head_std=1.0)
        self.actor = self._build_mlp(
            obs_dim, envs.single_action_space.n, head_std=0.01
        )

    @staticmethod
    def _build_mlp(in_dim, out_dim, head_std):
        """Return a 64-64 Tanh MLP whose last layer is initialised with ``head_std``."""
        return nn.Sequential(
            layer_init(nn.Linear(in_dim, 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, out_dim), std=head_std),
        )

    def get_value(self, x):
        """Return the critic output for observations ``x``."""
        return self.critic(x)

    def get_action_and_value(self, x, action=None):
        """Evaluate the policy and the critic on ``x``.

        Args:
            x: Observations fed to both networks.
            action: Actions to score; when ``None`` an action is sampled from
                the categorical distribution defined by the actor logits.

        Returns:
            Tuple ``(action, log_prob, entropy, value)``.
        """
        dist = Categorical(logits=self.actor(x))
        chosen = dist.sample() if action is None else action
        return chosen, dist.log_prob(chosen), dist.entropy(), self.critic(x)
# --- Training Loop Setup ---
# Script entry point: collect rollouts from a vectorised environment, update
# the agent with the clipped PPO objective, save the weights, then try to
# upload them.
if __name__ == "__main__":
    # Settings required by the advantage/update code that the module-level
    # hyperparameter block does not declare.
    gamma = 0.99
    gae_lambda = 0.95
    vf_coef = 0.5
    max_grad_norm = 0.5

    envs = gym.vector.SyncVectorEnv(
        [lambda: gym.make(env_id) for _ in range(num_envs)]
    )
    try:
        agent = Agent(envs).to(device)
        optimizer = optim.Adam(agent.parameters(), lr=learning_rate, eps=1e-5)

        # Storage setup: one slot per (step, environment) pair.
        obs_shape = envs.single_observation_space.shape
        act_shape = envs.single_action_space.shape
        obs = torch.zeros((num_steps, num_envs) + obs_shape).to(device)
        actions = torch.zeros((num_steps, num_envs) + act_shape).to(device)
        logprobs = torch.zeros((num_steps, num_envs)).to(device)
        rewards = torch.zeros((num_steps, num_envs)).to(device)
        dones = torch.zeros((num_steps, num_envs)).to(device)
        values = torch.zeros((num_steps, num_envs)).to(device)

        global_step = 0
        next_obs, _ = envs.reset()
        next_obs = torch.Tensor(next_obs).to(device)
        next_done = torch.zeros(num_envs).to(device)

        for iteration in range(1, total_timesteps // batch_size + 1):
            # 1. Rollout phase
            for step in range(num_steps):
                global_step += num_envs
                obs[step] = next_obs
                dones[step] = next_done
                with torch.no_grad():
                    action, logprob, _, value = agent.get_action_and_value(
                        next_obs
                    )
                    values[step] = value.flatten()
                actions[step] = action
                logprobs[step] = logprob

                next_obs, reward, terminations, truncations, _infos = envs.step(
                    action.cpu().numpy()
                )
                # An episode that ends either way is treated as finished when
                # bootstrapping below.
                next_done = np.logical_or(terminations, truncations)
                rewards[step] = torch.tensor(reward).to(device).view(-1)
                next_obs, next_done = (
                    torch.Tensor(next_obs).to(device),
                    torch.Tensor(next_done).to(device),
                )

            # 2. Advantage calculation (generalised advantage estimation).
            with torch.no_grad():
                next_value = agent.get_value(next_obs).reshape(-1)
                advantages = torch.zeros_like(rewards)
                last_gae = torch.zeros(num_envs).to(device)
                for t in reversed(range(num_steps)):
                    if t == num_steps - 1:
                        next_nonterminal = 1.0 - next_done
                        next_values = next_value
                    else:
                        # dones[t + 1] flags that the observation stored at
                        # t + 1 starts a new episode, i.e. step t ended one.
                        next_nonterminal = 1.0 - dones[t + 1]
                        next_values = values[t + 1]
                    delta = (
                        rewards[t]
                        + gamma * next_values * next_nonterminal
                        - values[t]
                    )
                    last_gae = (
                        delta + gamma * gae_lambda * next_nonterminal * last_gae
                    )
                    advantages[t] = last_gae
                returns = advantages + values

            # 3. PPO update: several epochs of shuffled minibatches over the
            # flattened rollout.
            b_obs = obs.reshape((-1,) + obs_shape)
            b_actions = actions.reshape((-1,) + act_shape).long()
            b_logprobs = logprobs.reshape(-1)
            b_advantages = advantages.reshape(-1)
            b_returns = returns.reshape(-1)

            for _epoch in range(update_epochs):
                order = torch.randperm(batch_size, device=device)
                for start in range(0, batch_size, minibatch_size):
                    mb = order[start:start + minibatch_size]

                    _, new_logprob, entropy, new_value = (
                        agent.get_action_and_value(b_obs[mb], b_actions[mb])
                    )
                    ratio = (new_logprob - b_logprobs[mb]).exp()

                    # Normalise advantages per minibatch to keep the policy
                    # gradient scale stable.
                    mb_adv = b_advantages[mb]
                    mb_adv = (mb_adv - mb_adv.mean()) / (mb_adv.std() + 1e-8)

                    # Clipped surrogate objective, written as a loss.
                    unclipped = -mb_adv * ratio
                    clipped = -mb_adv * torch.clamp(
                        ratio, 1 - clip_coef, 1 + clip_coef
                    )
                    pg_loss = torch.max(unclipped, clipped).mean()

                    v_loss = 0.5 * ((new_value.view(-1) - b_returns[mb]) ** 2).mean()
                    loss = pg_loss - ent_coef * entropy.mean() + vf_coef * v_loss

                    optimizer.zero_grad()
                    loss.backward()
                    nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
                    optimizer.step()

            print(f"Step: {global_step} | Training...")
    finally:
        # Release the sub-environments even when training fails part-way.
        envs.close()

    # --- Save and Upload ---
    model_name = "ppo-LunarLander-v2"
    torch.save(agent.state_dict(), f"{model_name}.pt")

    # Upload to Hub
    # Replace 'your-username' with your actual HF username
    repo_id = f"your-username/{model_name}"
    # push_to_hub_fast is neither defined nor imported in this file, so look
    # it up instead of calling it directly; a direct call would end the run
    # with a NameError.
    upload_fn = globals().get("push_to_hub_fast")
    if upload_fn is None:
        print(
            "push_to_hub_fast is not defined; skipping upload of "
            f"{model_name}.pt to {repo_id}"
        )
    else:
        # NOTE(review): folder_path="./" hands the whole working directory to
        # the uploader - confirm that is intended.
        upload_fn(
            repo_id=repo_id, folder_path="./", token=os.getenv("HF_TOKEN")
        )