lunar2 / ppo.py
loisonchambers's picture
Upload folder using huggingface_hub
12d0891 verified
import os
import gymnasium as gym
import numpy as np # Ensure numpy is imported
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical
# --- Hyperparameters ---
# Prefer the GPU when torch can see one; otherwise everything runs on the CPU.
device = (
    torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
)
# Environment id handed to gym.make for every sub-environment.
env_id = "LunarLander-v2"
# Total environment steps to collect, summed over all parallel environments.
total_timesteps = 500_000
# Step size given to the Adam optimizer.
learning_rate = 0.00025
# Rollout geometry: parallel environments x steps collected from each of them.
num_envs, num_steps = 4, 128
# Transitions gathered per iteration of the outer training loop.
batch_size = num_steps * num_envs
# The four values below are not referenced by the rollout code in this file;
# presumably they parameterize the PPO update step (minibatch size, passes
# over each batch, ratio clipping range, entropy bonus weight) - confirm.
minibatch_size = 32
update_epochs = 4
clip_coef = 0.2
ent_coef = 0.01
# --- Model Architecture ---
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Initialise ``layer`` in place: orthogonal weight, constant bias.

    Args:
        layer: Module exposing a ``weight`` tensor and a ``bias`` attribute,
            e.g. ``nn.Linear``. ``bias`` may be ``None``.
        std: Gain forwarded to ``torch.nn.init.orthogonal_``.
        bias_const: Value written to every element of the bias.

    Returns:
        The same ``layer`` object, so the call can be nested inside a
        container constructor such as ``nn.Sequential``.
    """
    torch.nn.init.orthogonal_(layer.weight, std)
    # Layers created with ``bias=False`` expose ``bias`` as None; calling
    # ``constant_`` on None would raise, so only touch a bias that exists.
    if getattr(layer, "bias", None) is not None:
        torch.nn.init.constant_(layer.bias, bias_const)
    return layer
class Agent(nn.Module):
    """Actor-critic model made of two independent tanh MLPs.

    ``critic`` maps an observation to a single output and ``actor`` maps it
    to one logit per discrete action. Each network has two hidden layers of
    width 64 and is initialised through ``layer_init``.
    """

    def __init__(self, envs):
        """Build both networks from the environment's per-env spaces.

        Args:
            envs: Object exposing ``single_observation_space.shape`` and
                ``single_action_space.n``.
        """
        super().__init__()
        # Flattened observation size, shared by both input layers.
        obs_dim = np.array(envs.single_observation_space.shape).prod()
        # The critic is created before the actor so parameters are drawn from
        # the RNG in the same order as a straight-line construction would.
        self.critic = self._build_mlp(obs_dim, 1, 1.0)
        self.actor = self._build_mlp(obs_dim, envs.single_action_space.n, 0.01)

    @staticmethod
    def _build_mlp(in_features, out_features, head_std):
        """Return a 64-64 tanh MLP whose last layer uses gain ``head_std``."""
        stack = [
            layer_init(nn.Linear(in_features, 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, out_features), std=head_std),
        ]
        return nn.Sequential(*stack)

    def get_value(self, x):
        """Return the critic's output for observations ``x``."""
        value = self.critic(x)
        return value

    def get_action_and_value(self, x, action=None):
        """Evaluate the policy and the critic on ``x``.

        Args:
            x: Observations fed to both networks.
            action: Actions to score. When ``None`` an action is sampled
                from the categorical distribution defined by the actor.

        Returns:
            Tuple ``(action, log_prob, entropy, value)``.
        """
        dist = Categorical(logits=self.actor(x))
        chosen = dist.sample() if action is None else action
        return chosen, dist.log_prob(chosen), dist.entropy(), self.critic(x)
# --- Training Loop Setup ---
# Additional PPO hyperparameters required by the advantage and update steps.
# The values are the conventional PPO defaults.
gamma = 0.99  # reward discount factor
gae_lambda = 0.95  # bias/variance trade-off of the advantage estimator
vf_coef = 0.5  # weight of the value loss in the total loss
max_grad_norm = 0.5  # global gradient-norm clipping threshold


def _compute_gae(rewards, values, dones, next_value, next_done):
    """Compute GAE advantages and value targets for one rollout.

    Args:
        rewards: ``(num_steps, num_envs)`` rewards received at each step.
        values: ``(num_steps, num_envs)`` critic estimates for each stored
            observation.
        dones: ``(num_steps, num_envs)``; ``dones[t]`` is 1 where the
            observation stored at step ``t`` begins a new episode.
        next_value: ``(num_envs,)`` critic estimate for the observation that
            follows the last stored step (used for bootstrapping).
        next_done: ``(num_envs,)`` done flags for that same observation.

    Returns:
        Tuple ``(advantages, returns)``, both shaped like ``rewards``.
    """
    horizon = rewards.shape[0]
    advantages = torch.zeros_like(rewards)
    last_gae = torch.zeros_like(next_value)
    for t in reversed(range(horizon)):
        if t == horizon - 1:
            next_nonterminal = 1.0 - next_done
            next_values = next_value
        else:
            next_nonterminal = 1.0 - dones[t + 1]
            next_values = values[t + 1]
        # Do not bootstrap or propagate advantages across episode boundaries.
        delta = rewards[t] + gamma * next_values * next_nonterminal - values[t]
        last_gae = delta + gamma * gae_lambda * next_nonterminal * last_gae
        advantages[t] = last_gae
    return advantages, advantages + values


def _ppo_update(
    agent, optimizer, b_obs, b_actions, b_logprobs, b_advantages, b_returns
):
    """Run ``update_epochs`` passes of clipped-surrogate PPO over one batch.

    All ``b_*`` tensors are flattened over (step, env) and share their first
    dimension. Minibatches of ``minibatch_size`` are drawn from a fresh random
    permutation on every epoch.
    """
    n_samples = b_obs.shape[0]
    for _ in range(update_epochs):
        perm = torch.randperm(n_samples, device=b_obs.device)
        for start in range(0, n_samples, minibatch_size):
            mb = perm[start:start + minibatch_size]
            _, new_logprob, entropy, new_value = agent.get_action_and_value(
                b_obs[mb], b_actions[mb]
            )
            ratio = (new_logprob - b_logprobs[mb]).exp()

            mb_adv = b_advantages[mb]
            # Per-minibatch normalisation; std is undefined for one sample.
            if mb_adv.numel() > 1:
                mb_adv = (mb_adv - mb_adv.mean()) / (mb_adv.std() + 1e-8)

            # Clipped surrogate objective, written as a loss to minimise.
            clipped_ratio = torch.clamp(ratio, 1 - clip_coef, 1 + clip_coef)
            pg_loss = torch.max(-mb_adv * ratio, -mb_adv * clipped_ratio).mean()
            v_loss = 0.5 * ((new_value.view(-1) - b_returns[mb]) ** 2).mean()
            loss = pg_loss - ent_coef * entropy.mean() + vf_coef * v_loss

            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
            optimizer.step()


if __name__ == "__main__":
    envs = gym.vector.SyncVectorEnv(
        [lambda: gym.make(env_id) for _ in range(num_envs)]
    )
    try:
        agent = Agent(envs).to(device)
        optimizer = optim.Adam(agent.parameters(), lr=learning_rate, eps=1e-5)

        # Storage setup: one slot per (step, env), allocated on the device.
        obs_shape = envs.single_observation_space.shape
        act_shape = envs.single_action_space.shape
        rollout_shape = (num_steps, num_envs)
        obs = torch.zeros(rollout_shape + obs_shape, device=device)
        # Actions are discrete indices, so keep them as integers.
        actions = torch.zeros(
            rollout_shape + act_shape, dtype=torch.long, device=device
        )
        logprobs = torch.zeros(rollout_shape, device=device)
        rewards = torch.zeros(rollout_shape, device=device)
        dones = torch.zeros(rollout_shape, device=device)
        values = torch.zeros(rollout_shape, device=device)

        global_step = 0
        next_obs, _ = envs.reset()
        next_obs = torch.as_tensor(next_obs, dtype=torch.float32, device=device)
        next_done = torch.zeros(num_envs, device=device)

        for iteration in range(1, total_timesteps // batch_size + 1):
            # 1. Rollout phase
            for step in range(num_steps):
                global_step += num_envs
                obs[step] = next_obs
                dones[step] = next_done
                with torch.no_grad():
                    action, logprob, _, value = agent.get_action_and_value(
                        next_obs
                    )
                values[step] = value.flatten()
                actions[step] = action
                logprobs[step] = logprob

                # Gymnasium returns (obs, reward, terminated, truncated, info).
                next_obs, reward, terminations, truncations, _infos = envs.step(
                    action.cpu().numpy()
                )
                rewards[step] = torch.as_tensor(
                    reward, dtype=torch.float32, device=device
                ).view(-1)
                next_obs = torch.as_tensor(
                    next_obs, dtype=torch.float32, device=device
                )
                next_done = torch.as_tensor(
                    np.logical_or(terminations, truncations),
                    dtype=torch.float32,
                    device=device,
                )

            # 2. Advantage calculation (GAE), bootstrapping from the critic's
            # estimate of the observation that follows the rollout.
            with torch.no_grad():
                next_value = agent.get_value(next_obs).flatten()
            advantages, returns = _compute_gae(
                rewards, values, dones, next_value, next_done
            )

            # 3. PPO update on the flattened (step, env) batch.
            _ppo_update(
                agent,
                optimizer,
                obs.reshape((-1,) + obs_shape),
                actions.reshape((-1,) + act_shape),
                logprobs.reshape(-1),
                advantages.reshape(-1),
                returns.reshape(-1),
            )
            print(f"Step: {global_step} | Training...")
    finally:
        # Release the sub-environments even if training is interrupted.
        envs.close()

    # --- Save and Upload ---
    model_name = "ppo-LunarLander-v2"
    torch.save(agent.state_dict(), f"{model_name}.pt")

    # Upload to Hub
    # Replace 'your-username' with your actual HF username
    repo_id = f"your-username/{model_name}"
    # NOTE(review): `push_to_hub_fast` is neither defined nor imported in this
    # file, so an unconditional call ends the run with a NameError right after
    # the checkpoint is written. Look it up defensively; define or import the
    # helper to enable the upload.
    try:
        uploader = push_to_hub_fast
    except NameError:
        uploader = None
    if uploader is None:
        print(
            f"Saved {model_name}.pt; upload to {repo_id} skipped because "
            "push_to_hub_fast is not defined."
        )
    else:
        uploader(repo_id=repo_id, folder_path="./", token=os.getenv("HF_TOKEN"))