"""PPO training scaffold for a discrete-action Gymnasium environment.

Collects rollouts from a vectorised environment with an actor-critic
``Agent``, then saves the agent's weights and hands the working directory
to an upload helper.

NOTE: the advantage computation and the PPO policy/value update are still
placeholders, so no optimiser step is ever taken and the checkpoint that
gets saved contains the freshly initialised weights.
"""

import os

import gymnasium as gym
import numpy as np  # Ensure numpy is imported
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical

# --- Hyperparameters ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env_id = "LunarLander-v2"
total_timesteps = 500000
learning_rate = 2.5e-4
num_envs = 4
num_steps = 128
# Environment steps consumed by one outer iteration of the training loop.
batch_size = num_envs * num_steps
# The four values below are not referenced by any code in this file yet;
# presumably they are meant for the PPO update that is still a placeholder.
minibatch_size = 32
update_epochs = 4
clip_coef = 0.2
ent_coef = 0.01


# --- Model Architecture ---
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Initialise ``layer`` in place and return it.

    Args:
        layer: Module exposing ``weight`` and ``bias`` tensors (used here
            with ``nn.Linear``).
        std: Scale passed to the orthogonal initialiser for the weights.
        bias_const: Constant every bias entry is set to.

    Returns:
        The same ``layer`` object, so the call can be nested inside
        ``nn.Sequential(...)``.
    """
    torch.nn.init.orthogonal_(layer.weight, std)
    torch.nn.init.constant_(layer.bias, bias_const)
    return layer


class Agent(nn.Module):
    """Actor-critic model with two independent Tanh MLPs.

    Both networks take the flattened observation and use two hidden layers
    of 64 units. The critic ends in a single output; the actor ends in one
    logit per discrete action.
    """

    def __init__(self, envs):
        """Build the critic and actor networks.

        Args:
            envs: Object exposing ``single_observation_space.shape`` and
                ``single_action_space.n`` (a Gymnasium vector env in the
                script below).
        """
        super().__init__()
        self.critic = nn.Sequential(
            layer_init(
                nn.Linear(
                    np.array(envs.single_observation_space.shape).prod(), 64
                )
            ),
            nn.Tanh(),
            layer_init(nn.Linear(64, 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, 1), std=1.0),
        )
        self.actor = nn.Sequential(
            layer_init(
                nn.Linear(
                    np.array(envs.single_observation_space.shape).prod(), 64
                )
            ),
            nn.Tanh(),
            layer_init(nn.Linear(64, 64)),
            nn.Tanh(),
            # Much smaller scale than the other layers for the logit head.
            layer_init(nn.Linear(64, envs.single_action_space.n), std=0.01),
        )

    def get_value(self, x):
        """Return the critic's output for observations ``x``."""
        return self.critic(x)

    def get_action_and_value(self, x, action=None):
        """Evaluate the policy and the critic on observations ``x``.

        Args:
            x: Observation tensor fed to both networks.
            action: Actions to score. When ``None`` an action is sampled
                from the categorical distribution defined by the actor's
                logits.

        Returns:
            Tuple ``(action, log_prob, entropy, value)`` where ``log_prob``
            and ``entropy`` come from the categorical distribution and
            ``value`` is the critic's output for ``x``.
        """
        logits = self.actor(x)
        probs = Categorical(logits=logits)
        if action is None:
            action = probs.sample()
        return action, probs.log_prob(action), probs.entropy(), self.critic(x)


# --- Training Loop Setup ---
if __name__ == "__main__":
    envs = gym.vector.SyncVectorEnv(
        [lambda: gym.make(env_id) for _ in range(num_envs)]
    )
    agent = Agent(envs).to(device)
    # Not stepped anywhere yet: the update phase below is a placeholder.
    optimizer = optim.Adam(agent.parameters(), lr=learning_rate, eps=1e-5)

    # Storage setup: one (num_steps, num_envs, ...) buffer per quantity.
    obs = torch.zeros(
        (num_steps, num_envs) + envs.single_observation_space.shape
    ).to(device)
    actions = torch.zeros(
        (num_steps, num_envs) + envs.single_action_space.shape
    ).to(device)
    logprobs = torch.zeros((num_steps, num_envs)).to(device)
    rewards = torch.zeros((num_steps, num_envs)).to(device)
    dones = torch.zeros((num_steps, num_envs)).to(device)
    values = torch.zeros((num_steps, num_envs)).to(device)

    global_step = 0
    next_obs, _ = envs.reset()
    next_obs = torch.Tensor(next_obs).to(device)
    next_done = torch.zeros(num_envs).to(device)

    for iteration in range(1, total_timesteps // batch_size + 1):
        # 1. Rollout phase
        for step in range(num_steps):
            global_step += num_envs
            # Store the observation/done flag the action is conditioned on.
            obs[step] = next_obs
            dones[step] = next_done

            # Acting only; no gradients are needed while collecting data.
            with torch.no_grad():
                action, logprob, _, value = agent.get_action_and_value(
                    next_obs
                )
                values[step] = value.flatten()
            actions[step] = action
            logprobs[step] = logprob

            next_obs, reward, terminations, truncations, infos = envs.step(
                action.cpu().numpy()
            )
            # An episode is over when it terminated or was truncated.
            next_done = np.logical_or(terminations, truncations)
            rewards[step] = torch.tensor(reward).to(device).view(-1)
            next_obs, next_done = (
                torch.Tensor(next_obs).to(device),
                torch.Tensor(next_done).to(device),
            )

        # 2. Advantage Calculation (GAE can be added here, simplified for
        #    brevity)
        # 3. PPO Update Logic (Actor and Critic Loss)
        # ... [Policy Update Logic goes here] ...
        print(f"Step: {global_step} | Training...")

    # Rollouts are finished; release the environments.
    envs.close()

    # --- Save and Upload ---
    model_name = "ppo-LunarLander-v2"
    torch.save(agent.state_dict(), f"{model_name}.pt")

    # Upload to Hub
    # Replace 'your-username' with your actual HF username
    repo_id = f"your-username/{model_name}"
    # NOTE(review): `push_to_hub_fast` is neither defined nor imported in
    # the visible part of this file. Unless it is provided elsewhere this
    # call raises NameError (after the checkpoint above has been written).
    # Confirm which upload helper is intended.
    # NOTE(review): folder_path="./" hands over the whole working directory
    # rather than just the checkpoint, and the token is None when HF_TOKEN
    # is unset - confirm both are acceptable to the helper.
    push_to_hub_fast(
        repo_id=repo_id, folder_path="./", token=os.getenv("HF_TOKEN")
    )