File size: 4,250 Bytes
12d0891
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import os

import gymnasium as gym
import numpy as np  # Ensure numpy is imported
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical

# --- Hyperparameters ---
# Use the GPU when PyTorch reports one as available, otherwise the CPU.
device = (
    torch.device("cuda")
    if torch.cuda.is_available()
    else torch.device("cpu")
)

# Environment id handed to gym.make.
# NOTE(review): newer Gymnasium releases may only register LunarLander-v3
# and reject "-v2" -- confirm against the installed version.
env_id = "LunarLander-v2"

# Rollout sizing: every iteration gathers num_steps transitions from each
# of the num_envs environments, i.e. batch_size transitions in total.
total_timesteps = 500_000
num_envs = 4
num_steps = 128
batch_size = num_steps * num_envs

# Optimiser step size (passed to Adam).
learning_rate = 0.00025

# Settings for the policy-update phase. Nothing in the visible code reads
# these yet; they appear to be reserved for the placeholder PPO update.
minibatch_size = 32
update_epochs = 4
clip_coef = 0.2
ent_coef = 0.01


# --- Model Architecture ---
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Initialise ``layer`` in place and return it.

    The weight is filled with an orthogonal matrix scaled by ``std``; the
    bias, when the layer has one, is filled with ``bias_const``.

    Args:
        layer: Module exposing a ``weight`` tensor and, optionally, a
            ``bias`` tensor (for example ``nn.Linear``).
        std: Gain forwarded to ``torch.nn.init.orthogonal_``.
        bias_const: Value written into every element of the bias.

    Returns:
        The same ``layer`` object, so the call can be nested directly
        inside ``nn.Sequential(...)``.
    """
    torch.nn.init.orthogonal_(layer.weight, std)
    # Layers created with ``bias=False`` expose ``bias`` as None; there is
    # nothing to fill then, and calling constant_ on None would raise.
    if getattr(layer, "bias", None) is not None:
        torch.nn.init.constant_(layer.bias, bias_const)
    return layer


class Agent(nn.Module):
    """Actor-critic model made of two independent MLPs.

    ``critic`` maps an observation to a single value estimate and ``actor``
    maps it to one logit per discrete action. Each network is three
    ``nn.Linear`` layers with ``nn.Tanh`` between them and a hidden width
    of 64; every linear layer is passed through ``layer_init``.
    """

    def __init__(self, envs):
        """Build both networks from the spaces exposed by ``envs``.

        Args:
            envs: Object providing ``single_observation_space.shape`` and
                ``single_action_space.n`` (e.g. a Gymnasium vector env).
        """
        super().__init__()
        # Input width is the product of the observation shape's entries.
        obs_dim = np.array(envs.single_observation_space.shape).prod()
        num_actions = envs.single_action_space.n

        # The critic is constructed before the actor so that layer
        # creation order (and hence RNG use and parameter order) is fixed.
        critic_layers = [
            layer_init(nn.Linear(obs_dim, 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, 1), std=1.0),
        ]
        self.critic = nn.Sequential(*critic_layers)

        actor_layers = [
            layer_init(nn.Linear(obs_dim, 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, 64)),
            nn.Tanh(),
            # Small gain on the output layer keeps initial logits near 0.
            layer_init(nn.Linear(64, num_actions), std=0.01),
        ]
        self.actor = nn.Sequential(*actor_layers)

    def get_value(self, x):
        """Return the critic's output for observations ``x``."""
        state_value = self.critic(x)
        return state_value

    def get_action_and_value(self, x, action=None):
        """Evaluate the policy and the critic on observations ``x``.

        Args:
            x: Observation tensor fed to both networks.
            action: Optional actions to score. When None, actions are
                sampled from the categorical policy.

        Returns:
            Tuple ``(action, log_prob, entropy, value)`` where ``log_prob``
            and ``entropy`` come from the categorical distribution built on
            the actor's logits and ``value`` is the critic's output.
        """
        policy = Categorical(logits=self.actor(x))
        chosen = policy.sample() if action is None else action
        log_prob = policy.log_prob(chosen)
        entropy = policy.entropy()
        return chosen, log_prob, entropy, self.critic(x)


# --- Training Loop Setup ---
if __name__ == "__main__":
    # Script entry point: build the vectorised environment and the agent,
    # collect rollouts for total_timesteps environment steps, then save the
    # agent's weights and attempt a Hub upload.
    #
    # NOTE: the advantage computation and PPO update are still
    # placeholders below, so the optimizer is never stepped and the saved
    # state dict holds the initial (untrained) parameters.
    envs = gym.vector.SyncVectorEnv(
        [lambda: gym.make(env_id) for _ in range(num_envs)]
    )
    try:
        agent = Agent(envs).to(device)
        optimizer = optim.Adam(agent.parameters(), lr=learning_rate, eps=1e-5)

        # Storage setup: one slot per (step, env) pair for each quantity.
        obs = torch.zeros(
            (num_steps, num_envs) + envs.single_observation_space.shape,
            device=device,
        )
        actions = torch.zeros(
            (num_steps, num_envs) + envs.single_action_space.shape,
            device=device,
        )
        logprobs = torch.zeros((num_steps, num_envs), device=device)
        rewards = torch.zeros((num_steps, num_envs), device=device)
        dones = torch.zeros((num_steps, num_envs), device=device)
        values = torch.zeros((num_steps, num_envs), device=device)

        global_step = 0
        next_obs, _ = envs.reset()
        next_obs = torch.Tensor(next_obs).to(device)
        next_done = torch.zeros(num_envs).to(device)

        for iteration in range(1, total_timesteps // batch_size + 1):
            # 1. Rollout phase
            for step in range(num_steps):
                # Each vector step advances every environment once.
                global_step += num_envs
                obs[step] = next_obs
                dones[step] = next_done

                # Acting needs no gradients; only the update phase would.
                with torch.no_grad():
                    action, logprob, _, value = agent.get_action_and_value(
                        next_obs
                    )
                    values[step] = value.flatten()
                actions[step] = action
                logprobs[step] = logprob

                next_obs, reward, terminations, truncations, infos = (
                    envs.step(action.cpu().numpy())
                )
                # An episode is over when it terminated or was truncated.
                next_done = np.logical_or(terminations, truncations)
                rewards[step] = torch.tensor(reward).to(device).view(-1)
                next_obs = torch.Tensor(next_obs).to(device)
                next_done = torch.Tensor(next_done).to(device)

            # 2. Advantage Calculation (GAE can be added here, simplified
            #    for brevity)
            # 3. PPO Update Logic (Actor and Critic Loss)
            # ... [Policy Update Logic goes here] ...

            print(f"Step: {global_step} | Training...")
    finally:
        # Release the underlying environments even if the rollout fails.
        envs.close()

    # --- Save and Upload ---
    model_name = "ppo-LunarLander-v2"
    torch.save(agent.state_dict(), f"{model_name}.pt")

    # Upload to Hub
    # Replace 'your-username' with your actual HF username
    repo_id = f"your-username/{model_name}"
    # push_to_hub_fast is neither defined nor imported in this file, so a
    # direct call would end the run with a NameError. Look it up instead
    # and skip the upload with a clear message when it is missing.
    uploader = globals().get("push_to_hub_fast")
    if uploader is None:
        print(
            "push_to_hub_fast is not defined; skipping Hub upload. "
            f"Weights were saved locally to {model_name}.pt"
        )
    else:
        # NOTE(review): folder_path="./" hands the whole current working
        # directory to the uploader -- confirm that is intended.
        uploader(
            repo_id=repo_id, folder_path="./", token=os.getenv("HF_TOKEN")
        )