loisonchambers
/

lunar2

Reinforcement Learning

deep-reinforcement-learning

custom-implementation

Eval Results (legacy)

Model card Files Files and versions

Metrics Training metrics Community

lunar2 / ppo.py

loisonchambers's picture

Upload folder using huggingface_hub

12d0891 verified about 1 month ago

history blame contribute delete

4.25 kB

	import os

	import gymnasium as gym
	import numpy as np # Ensure numpy is imported
	import torch
	import torch.nn as nn
	import torch.optim as optim
	from torch.distributions.categorical import Categorical

	# --- Hyperparameters ---
	# Environment to instantiate and the total number of environment steps.
	env_id = "LunarLander-v2"
	total_timesteps = 500_000

	# Rollout geometry: one batch is num_steps transitions from each of the
	# num_envs environments.
	num_envs = 4
	num_steps = 128
	batch_size = num_steps * num_envs

	# Optimiser step size.
	learning_rate = 0.00025

	# NOTE(review): the four values below are not read anywhere in the visible
	# code; presumably they belong to the PPO update step -- TODO confirm.
	minibatch_size = 32
	update_epochs = 4
	clip_coef = 0.2
	ent_coef = 0.01

	# Run on the GPU when CUDA is available, otherwise on the CPU.
	if torch.cuda.is_available():
	    device = torch.device("cuda")
	else:
	    device = torch.device("cpu")


	# --- Model Architecture ---
	def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
	    """Initialise ``layer`` in place and return it.

	    The weight is filled with an orthogonal matrix scaled by ``std``; the
	    bias, when the layer has one, is set to the constant ``bias_const``.

	    Args:
	        layer: Module exposing a ``weight`` tensor and, optionally, a
	            ``bias`` tensor (``nn.Linear`` in this file).
	        std: Gain forwarded to ``torch.nn.init.orthogonal_``.
	        bias_const: Value written to every bias element.

	    Returns:
	        The same ``layer`` object, so the call can be nested directly
	        inside ``nn.Sequential(...)``.
	    """
	    torch.nn.init.orthogonal_(layer.weight, std)
	    # A layer built with bias=False has bias=None; constant_ would raise on it.
	    if getattr(layer, "bias", None) is not None:
	        torch.nn.init.constant_(layer.bias, bias_const)
	    return layer


	class Agent(nn.Module):
	    """Separate actor and critic MLPs for a discrete-action vector env.

	    Both networks take a flattened observation and use two hidden layers of
	    64 units with ``Tanh`` activations. The critic ends in a single output
	    (initialised with ``std=1.0``); the actor ends in one logit per action
	    (initialised with ``std=0.01``).

	    Args:
	        envs: Object exposing ``single_observation_space.shape`` and
	            ``single_action_space.n``.
	    """

	    def __init__(self, envs):
	        super().__init__()
	        # Flattened observation size, computed once and converted to a plain
	        # int so nn.Linear does not receive a NumPy scalar.
	        obs_dim = int(np.prod(envs.single_observation_space.shape))
	        n_actions = envs.single_action_space.n

	        # The critic is built before the actor so layers are created (and
	        # random numbers consumed) in the same order as before.
	        self.critic = nn.Sequential(
	            layer_init(nn.Linear(obs_dim, 64)),
	            nn.Tanh(),
	            layer_init(nn.Linear(64, 64)),
	            nn.Tanh(),
	            layer_init(nn.Linear(64, 1), std=1.0),
	        )
	        self.actor = nn.Sequential(
	            layer_init(nn.Linear(obs_dim, 64)),
	            nn.Tanh(),
	            layer_init(nn.Linear(64, 64)),
	            nn.Tanh(),
	            layer_init(nn.Linear(64, n_actions), std=0.01),
	        )

	    def get_value(self, x):
	        """Return the critic's output for observations ``x``."""
	        return self.critic(x)

	    def get_action_and_value(self, x, action=None):
	        """Evaluate the policy and the critic on observations ``x``.

	        Args:
	            x: Observation tensor fed to both networks.
	            action: Actions to score. When ``None``, actions are sampled
	                from the categorical distribution defined by the actor's
	                logits.

	        Returns:
	            A 4-tuple ``(action, log_prob, entropy, value)``: the sampled or
	            supplied action, its log-probability under the current policy,
	            the entropy of the action distribution, and the critic output.
	        """
	        logits = self.actor(x)
	        probs = Categorical(logits=logits)
	        if action is None:
	            action = probs.sample()
	        return action, probs.log_prob(action), probs.entropy(), self.critic(x)


	# --- Training Loop Setup ---
	if __name__ == "__main__":
	    # Script entry point: collect rollouts from num_envs synchronous copies
	    # of env_id, then save the agent's weights and upload the folder.
	    envs = gym.vector.SyncVectorEnv(
	        [lambda: gym.make(env_id) for _ in range(num_envs)]
	    )
	    try:
	        agent = Agent(envs).to(device)
	        # NOTE(review): the optimizer is created but never stepped in the
	        # visible code -- the update logic below is still a placeholder.
	        optimizer = optim.Adam(agent.parameters(), lr=learning_rate, eps=1e-5)

	        # Storage setup: one slot per (step, env) pair of a rollout.
	        rollout_shape = (num_steps, num_envs)
	        obs = torch.zeros(
	            rollout_shape + envs.single_observation_space.shape, device=device
	        )
	        actions = torch.zeros(
	            rollout_shape + envs.single_action_space.shape, device=device
	        )
	        logprobs = torch.zeros(rollout_shape, device=device)
	        rewards = torch.zeros(rollout_shape, device=device)
	        dones = torch.zeros(rollout_shape, device=device)
	        values = torch.zeros(rollout_shape, device=device)

	        global_step = 0
	        next_obs, _ = envs.reset()
	        next_obs = torch.Tensor(next_obs).to(device)
	        next_done = torch.zeros(num_envs).to(device)

	        for iteration in range(1, total_timesteps // batch_size + 1):
	            # 1. Rollout phase
	            for step in range(num_steps):
	                global_step += num_envs
	                obs[step] = next_obs
	                dones[step] = next_done

	                # Acting needs no gradients.
	                with torch.no_grad():
	                    action, logprob, _, value = agent.get_action_and_value(
	                        next_obs
	                    )
	                    values[step] = value.flatten()
	                actions[step] = action
	                logprobs[step] = logprob

	                next_obs, reward, terminations, truncations, _infos = envs.step(
	                    action.cpu().numpy()
	                )
	                # An episode is over when it either terminated or was truncated.
	                next_done = np.logical_or(terminations, truncations)
	                rewards[step] = torch.tensor(reward).to(device).view(-1)
	                next_obs = torch.Tensor(next_obs).to(device)
	                next_done = torch.Tensor(next_done).to(device)

	            # 2. Advantage Calculation (GAE can be added here, simplified for brevity)
	            # 3. PPO Update Logic (Actor and Critic Loss)
	            # ... [Policy Update Logic goes here] ...
	            # NOTE(review): until the two steps above are implemented, the
	            # weights saved below are the initial ones.

	            print(f"Step: {global_step} | Training...")
	    finally:
	        # Release the environments even if the rollout loop raises.
	        envs.close()

	    # --- Save and Upload ---
	    model_name = "ppo-LunarLander-v2"
	    torch.save(agent.state_dict(), f"{model_name}.pt")

	    # Upload to Hub
	    # Replace 'your-username' with your actual HF username
	    repo_id = f"your-username/{model_name}"
	    # NOTE(review): push_to_hub_fast is neither defined nor imported in the
	    # visible part of this file; unless it is provided elsewhere this call
	    # raises NameError (after the weights have been saved). TODO: replace
	    # with the intended upload helper.
	    push_to_hub_fast(
	        repo_id=repo_id, folder_path="./", token=os.getenv("HF_TOKEN")
	    )