# File size: 4,250 Bytes

# 12d0891

import os

import gymnasium as gym
import numpy as np  # Ensure numpy is imported
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical

# --- Hyperparameters ---
# Compute device: CUDA when torch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Environment id handed to ``gym.make`` for every vectorized copy.
env_id = "LunarLander-v2"

# Total environment steps; the training loop runs
# ``total_timesteps // batch_size`` iterations.
total_timesteps = 500_000

# Step size given to the Adam optimizer.
learning_rate = 0.00025

# Rollout geometry: ``num_envs`` parallel environments are each stepped
# ``num_steps`` times per iteration, so one iteration gathers
# ``batch_size`` transitions.
num_envs = 4
num_steps = 128
batch_size = num_steps * num_envs

# Settings reserved for the PPO policy-update step (minibatch size,
# passes over each batch, surrogate clipping range, entropy bonus weight).
minibatch_size = 32
update_epochs = 4
clip_coef = 0.2
ent_coef = 0.01


# --- Model Architecture ---
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Initialize a layer in place with orthogonal weights and a constant bias.

    Args:
        layer: Module exposing a ``weight`` tensor and, optionally, a
            ``bias`` tensor (for example ``nn.Linear``).
        std: Gain passed to ``torch.nn.init.orthogonal_``.
        bias_const: Value written to every bias element.

    Returns:
        The same ``layer`` object, so the call can be nested directly inside
        an ``nn.Sequential`` constructor.
    """
    torch.nn.init.orthogonal_(layer.weight, std)
    # Layers created with ``bias=False`` carry ``bias = None``; skip the bias
    # fill for them instead of failing inside ``constant_``.
    if getattr(layer, "bias", None) is not None:
        torch.nn.init.constant_(layer.bias, bias_const)
    return layer


class Agent(nn.Module):
    """Actor-critic model made of two independent 64-unit Tanh MLPs.

    ``critic`` maps an observation to a single output; ``actor`` maps an
    observation to one logit per discrete action.
    """

    def __init__(self, envs):
        """Build both networks from the spaces exposed by ``envs``.

        Args:
            envs: Object providing ``single_observation_space.shape`` and
                ``single_action_space.n``.
        """
        super().__init__()
        obs_dim = np.array(envs.single_observation_space.shape).prod()

        # Value head: the final layer uses a gain of 1.0.
        value_layers = [
            layer_init(nn.Linear(obs_dim, 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, 1), std=1.0),
        ]
        self.critic = nn.Sequential(*value_layers)

        # Policy head: the final layer uses a small gain (0.01), which keeps
        # the initial logits close to each other.
        policy_layers = [
            layer_init(nn.Linear(obs_dim, 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, envs.single_action_space.n), std=0.01),
        ]
        self.actor = nn.Sequential(*policy_layers)

    def get_value(self, x):
        """Return the critic output for observations ``x``."""
        return self.critic(x)

    def get_action_and_value(self, x, action=None):
        """Evaluate the policy and the critic on observations ``x``.

        Args:
            x: Observations fed to both networks.
            action: Actions to score. When ``None`` an action is sampled
                from the categorical policy.

        Returns:
            Tuple ``(action, log_prob, entropy, value)`` where ``log_prob``
            and ``entropy`` come from the categorical distribution over the
            actor logits and ``value`` is the critic output.
        """
        dist = Categorical(logits=self.actor(x))
        chosen = dist.sample() if action is None else action
        return chosen, dist.log_prob(chosen), dist.entropy(), self.critic(x)


# --- PPO helpers ---
def compute_gae(
    rewards: torch.Tensor,
    values: torch.Tensor,
    dones: torch.Tensor,
    next_value: torch.Tensor,
    next_done: torch.Tensor,
    gamma: float = 0.99,
    gae_lambda: float = 0.95,
):
    """Compute Generalized Advantage Estimation over one rollout.

    Args:
        rewards: Rewards, shape ``(num_steps, num_envs)``.
        values: Critic estimates for the stored observations, same shape.
        dones: ``dones[t]`` is 1.0 when the observation stored at step ``t``
            is the first one of a new episode, same shape.
        next_value: Critic estimate for the observation that follows the
            last stored step, shape ``(num_envs,)``.
        next_done: Done flags for that following observation, shape
            ``(num_envs,)``.
        gamma: Discount factor.
        gae_lambda: GAE smoothing factor.

    Returns:
        Tuple ``(advantages, returns)``, both shaped like ``rewards``, where
        ``returns = advantages + values``.
    """
    steps = rewards.shape[0]
    advantages = torch.zeros_like(rewards)
    running = torch.zeros_like(next_value)
    for t in reversed(range(steps)):
        if t == steps - 1:
            not_done = 1.0 - next_done
            following_value = next_value
        else:
            not_done = 1.0 - dones[t + 1]
            following_value = values[t + 1]
        # ``not_done`` zeroes both the bootstrap and the running sum whenever
        # step t ended an episode, so nothing leaks across episode boundaries.
        delta = rewards[t] + gamma * following_value * not_done - values[t]
        running = delta + gamma * gae_lambda * not_done * running
        advantages[t] = running
    return advantages, advantages + values


def ppo_update(
    agent,
    optimizer,
    b_obs,
    b_actions,
    b_logprobs,
    b_advantages,
    b_returns,
    *,
    epochs,
    minibatch,
    clip,
    ent,
    vf_coef=0.5,
    max_grad_norm=0.5,
):
    """Run the clipped-surrogate PPO optimization on one flattened batch.

    Args:
        agent: Model exposing ``get_action_and_value(obs, action)`` and
            ``parameters()``.
        optimizer: Optimizer stepping ``agent``'s parameters.
        b_obs: Flattened observations, first dimension is the batch.
        b_actions: Flattened actions taken during the rollout.
        b_logprobs: Log-probabilities of those actions under the rollout
            policy.
        b_advantages: Flattened advantage estimates.
        b_returns: Flattened value targets.
        epochs: Number of passes over the batch.
        minibatch: Number of samples per gradient step.
        clip: Clipping range for the probability ratio.
        ent: Weight of the entropy bonus.
        vf_coef: Weight of the value loss.
        max_grad_norm: Global gradient-norm clipping threshold.
    """
    total = b_obs.shape[0]
    for _ in range(epochs):
        # Fresh shuffle each epoch so minibatches differ between passes.
        order = torch.randperm(total, device=b_obs.device)
        for start in range(0, total, minibatch):
            mb = order[start : start + minibatch]
            # Actions are stored in a float buffer; Categorical needs integers.
            _, new_logprob, entropy, new_value = agent.get_action_and_value(
                b_obs[mb], b_actions[mb].long()
            )
            ratio = (new_logprob - b_logprobs[mb]).exp()

            mb_adv = b_advantages[mb]
            if mb_adv.numel() > 1:
                # Per-minibatch normalization; skipped for a single sample,
                # whose standard deviation is undefined.
                mb_adv = (mb_adv - mb_adv.mean()) / (mb_adv.std() + 1e-8)

            # Pessimistic (max of the negated objectives) clipped surrogate.
            unclipped = -mb_adv * ratio
            clipped = -mb_adv * torch.clamp(ratio, 1 - clip, 1 + clip)
            policy_loss = torch.max(unclipped, clipped).mean()

            value_loss = (
                0.5 * ((new_value.view(-1) - b_returns[mb]) ** 2).mean()
            )
            loss = policy_loss - ent * entropy.mean() + vf_coef * value_loss

            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
            optimizer.step()


# --- Training Loop Setup ---
if __name__ == "__main__":
    envs = gym.vector.SyncVectorEnv(
        [lambda: gym.make(env_id) for _ in range(num_envs)]
    )
    # Everything that uses the environments sits inside try/finally so they
    # are closed even when training fails part-way.
    try:
        agent = Agent(envs).to(device)
        optimizer = optim.Adam(agent.parameters(), lr=learning_rate, eps=1e-5)

        # Storage setup: one (num_steps, num_envs, ...) buffer per quantity.
        obs_shape = envs.single_observation_space.shape
        act_shape = envs.single_action_space.shape
        obs = torch.zeros((num_steps, num_envs) + obs_shape, device=device)
        actions = torch.zeros((num_steps, num_envs) + act_shape, device=device)
        logprobs = torch.zeros((num_steps, num_envs), device=device)
        rewards = torch.zeros((num_steps, num_envs), device=device)
        dones = torch.zeros((num_steps, num_envs), device=device)
        values = torch.zeros((num_steps, num_envs), device=device)

        global_step = 0
        next_obs, _ = envs.reset()
        next_obs = torch.as_tensor(
            next_obs, dtype=torch.float32, device=device
        )
        next_done = torch.zeros(num_envs, device=device)

        for _ in range(1, total_timesteps // batch_size + 1):
            # 1. Rollout phase
            for step in range(num_steps):
                global_step += num_envs
                obs[step] = next_obs
                dones[step] = next_done

                with torch.no_grad():
                    action, logprob, _, value = agent.get_action_and_value(
                        next_obs
                    )
                    values[step] = value.flatten()
                actions[step] = action
                logprobs[step] = logprob

                next_obs, reward, terminations, truncations, _ = envs.step(
                    action.cpu().numpy()
                )
                # An episode is over when it terminated OR was truncated.
                episode_over = np.logical_or(terminations, truncations)
                rewards[step] = torch.as_tensor(
                    reward, dtype=torch.float32, device=device
                ).view(-1)
                next_obs = torch.as_tensor(
                    next_obs, dtype=torch.float32, device=device
                )
                next_done = torch.as_tensor(
                    episode_over, dtype=torch.float32, device=device
                )

            # 2. Advantage calculation (GAE), bootstrapping from the value of
            #    the observation that follows the last stored step.
            with torch.no_grad():
                next_value = agent.get_value(next_obs).reshape(-1)
            advantages, returns = compute_gae(
                rewards, values, dones, next_value, next_done
            )

            # 3. PPO update (actor and critic losses) on the flattened batch.
            ppo_update(
                agent,
                optimizer,
                obs.reshape((-1,) + obs_shape),
                actions.reshape((-1,) + act_shape),
                logprobs.reshape(-1),
                advantages.reshape(-1),
                returns.reshape(-1),
                epochs=update_epochs,
                minibatch=minibatch_size,
                clip=clip_coef,
                ent=ent_coef,
            )

            print(f"Step: {global_step} | Training...")
    finally:
        envs.close()

    # --- Save and Upload ---
    model_name = "ppo-LunarLander-v2"
    torch.save(agent.state_dict(), f"{model_name}.pt")

    # Upload to Hub
    # Replace 'your-username' with your actual HF username
    repo_id = f"your-username/{model_name}"
    # ``push_to_hub_fast`` is neither defined nor imported in this file, so
    # an unguarded call ends the script with NameError. Look it up at run
    # time and skip the upload when it is missing.
    # NOTE(review): the intended API is probably
    # ``huggingface_hub.HfApi().upload_folder`` -- confirm and import it.
    # NOTE(review): ``folder_path="./"`` uploads the whole working directory,
    # not just the saved checkpoint -- confirm this is intended.
    uploader = globals().get("push_to_hub_fast")
    if uploader is None:
        print("push_to_hub_fast is not available; skipping upload to the Hub.")
    else:
        uploader(
            repo_id=repo_id, folder_path="./", token=os.getenv("HF_TOKEN")
        )