| | import json |
| | import datetime |
| |
|
| | import numpy as np |
| |
|
| | from collections import deque |
| |
|
| | |
| | import torch |
| | import torch.nn as nn |
| | import torch.nn.functional as F |
| | import torch.optim as optim |
| | from huggingface_hub import metadata_eval_result, HfApi, metadata_save |
| | from torch.distributions import Categorical |
| |
|
| | |
| | import gym |
| | import gym_pygame |
| |
|
| | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") |
| | print(device) |
| |
|
| | env_id = "Pixelcopter-PLE-v0" |
| | env = gym.make(env_id) |
| | eval_env = gym.make(env_id) |
| | s_size = env.observation_space.shape[0] |
| | a_size = env.action_space.n |
| |
|
| |
|
| | class Policy(nn.Module): |
| | def __init__(self, s_size, a_size, h_size): |
| | super(Policy, self).__init__() |
| | self.fc1 = nn.Linear(s_size, h_size) |
| | self.fc2 = nn.Linear(h_size, h_size * 2) |
| | self.fc3 = nn.Linear(h_size * 2, a_size) |
| |
|
| | def forward(self, x): |
| | x = F.relu(self.fc1(x)) |
| | x = F.relu(self.fc2(x)) |
| | x = self.fc3(x) |
| | return F.softmax(x, dim=1) |
| |
|
| | def act(self, state): |
| | state = torch.from_numpy(state).float().unsqueeze(0).to(device) |
| | probs = self.forward(state).cpu() |
| | m = Categorical(probs) |
| | action = m.sample() |
| | return action.item(), m.log_prob(action) |
| |
|
| |
|
| | def reinforce(policy, optimizer, n_training_episodes, max_t, gamma, print_every): |
| | |
| | scores_deque = deque(maxlen=100) |
| | scores = [] |
| | |
| | for i_episode in range(1, n_training_episodes + 1): |
| | saved_log_probs = [] |
| | rewards = [] |
| | state = env.reset() |
| | |
| | for t in range(max_t): |
| | action, log_prob = policy.act(state) |
| | saved_log_probs.append(log_prob) |
| | state, reward, done, _ = env.step(action) |
| | rewards.append(reward) |
| | if done: |
| | break |
| | scores_deque.append(sum(rewards)) |
| | scores.append(sum(rewards)) |
| |
|
| | |
| | returns = deque(maxlen=max_t) |
| | n_steps = len(rewards) |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | for t in range(n_steps)[::-1]: |
| | disc_return_t = (returns[0] if len(returns) > 0 else 0) |
| | returns.appendleft(gamma * disc_return_t + rewards[t]) |
| |
|
| | |
| | eps = np.finfo(np.float32).eps.item() |
| | |
| | |
| | returns = torch.tensor(returns) |
| | returns = (returns - returns.mean()) / (returns.std() + eps) |
| |
|
| | |
| | policy_loss = [] |
| | for log_prob, disc_return in zip(saved_log_probs, returns): |
| | policy_loss.append(-log_prob * disc_return) |
| | policy_loss = torch.cat(policy_loss).sum() |
| |
|
| | |
| | optimizer.zero_grad() |
| | policy_loss.backward() |
| | optimizer.step() |
| |
|
| | if i_episode % print_every == 0: |
| | print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque))) |
| |
|
| | return scores |
| |
|
| |
|
| | pixelcopter_hyperparameters = { |
| | "h_size": 64, |
| | "n_training_episodes": 1000, |
| | "n_evaluation_episodes": 10, |
| | "max_t": 10000, |
| | "gamma": 0.99, |
| | "lr": 1e-4, |
| | "env_id": env_id, |
| | "state_space": s_size, |
| | "action_space": a_size, |
| | } |
| |
|
| | |
| | |
| | pixelcopter_policy = Policy(pixelcopter_hyperparameters["state_space"], pixelcopter_hyperparameters["action_space"], |
| | pixelcopter_hyperparameters["h_size"]).to(device) |
| | pixelcopter_optimizer = optim.Adam(pixelcopter_policy.parameters(), lr=pixelcopter_hyperparameters["lr"]) |
| |
|
| | scores = reinforce(pixelcopter_policy, |
| | pixelcopter_optimizer, |
| | pixelcopter_hyperparameters["n_training_episodes"], |
| | pixelcopter_hyperparameters["max_t"], |
| | pixelcopter_hyperparameters["gamma"], |
| | 1000) |
| |
|
| |
|
| | def push_to_hub(repo_id, |
| | model, |
| | hyperparameters, |
| | ): |
| | """ |
| | Evaluate, Generate a video and Upload a model to Hugging Face Hub. |
| | This method does the complete pipeline: |
| | - It evaluates the model |
| | - It generates the model card |
| | - It generates a replay video of the agent |
| | - It pushes everything to the Hub |
| | |
| | :param repo_id: repo_id: id of the model repository from the Hugging Face Hub |
| | :param model: the pytorch model we want to save |
| | :param hyperparameters: training hyperparameters |
| | :param eval_env: evaluation environment |
| | :param video_fps: how many frame per seconds to record our video replay |
| | """ |
| |
|
| | _, repo_name = repo_id.split("/") |
| | api = HfApi() |
| |
|
| | |
| | repo_url = api.create_repo( |
| | repo_id=repo_id, |
| | exist_ok=True, |
| | ) |
| |
|
| | |
| | torch.save(model, "model.pt") |
| |
|
| | |
| | with open("hyperparameters.json", "w") as outfile: |
| | json.dump(hyperparameters, outfile) |
| |
|
| | |
| | mean_reward, std_reward = 5.03, 0 |
| | |
| | eval_datetime = datetime.datetime.now() |
| | eval_form_datetime = eval_datetime.isoformat() |
| |
|
| | evaluate_data = { |
| | "env_id": hyperparameters["env_id"], |
| | "mean_reward": mean_reward, |
| | "n_evaluation_episodes": hyperparameters["n_evaluation_episodes"], |
| | "eval_datetime": eval_form_datetime, |
| | } |
| |
|
| | |
| | with open("results.json", "w") as outfile: |
| | json.dump(evaluate_data, outfile) |
| |
|
| | |
| | env_name = hyperparameters["env_id"] |
| |
|
| | metadata = {} |
| | metadata["tags"] = [ |
| | env_name, |
| | "reinforce", |
| | "reinforcement-learning", |
| | "custom-implementation", |
| | "deep-rl-class" |
| | ] |
| |
|
| | |
| | eval = metadata_eval_result( |
| | model_pretty_name=repo_name, |
| | task_pretty_name="reinforcement-learning", |
| | task_id="reinforcement-learning", |
| | metrics_pretty_name="mean_reward", |
| | metrics_id="mean_reward", |
| | metrics_value=f"{mean_reward:.2f} +/- {std_reward:.2f}", |
| | dataset_pretty_name=env_name, |
| | dataset_id=env_name, |
| | ) |
| |
|
| | |
| | metadata = {**metadata, **eval} |
| |
|
| | model_card = f""" |
| | # **Reinforce** Agent playing **{env_id}** |
| | This is a trained model of a **Reinforce** agent playing **{env_id}** . |
| | To learn to use this model and train yours check Unit 4 of the Deep Reinforcement Learning Course: https://huggingface.co/deep-rl-course/unit4/introduction |
| | """ |
| |
|
| | readme_path = "README.md" |
| | readme = model_card |
| | with open(readme_path, "w", encoding="utf-8") as f: |
| | f.write(readme) |
| |
|
| | |
| | metadata_save(readme_path, metadata) |
| |
|
| | |
| | api.upload_folder( |
| | repo_id=repo_id, |
| | folder_path=".", |
| | path_in_repo=".", |
| | ) |
| |
|
| | print(f"Your model is pushed to the Hub. You can view your model here: {repo_url}") |
| |
|
| |
|
| | repo_id = "cyrodw/Reinforce-Pixelcopter" |
| | push_to_hub(repo_id, |
| | pixelcopter_policy, |
| | pixelcopter_hyperparameters, |
| | ) |
| |
|