bkhan2000 committed on
Commit
27b8491
·
1 Parent(s): 3c7abae

Push agent to the Hub

Browse files
README.md ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - LunarLander-v2
4
+ - ppo
5
+ - deep-reinforcement-learning
6
+ - reinforcement-learning
7
+ - custom-implementation
8
+ - deep-rl-course
9
+ model-index:
10
+ - name: PPO
11
+ results:
12
+ - task:
13
+ type: reinforcement-learning
14
+ name: reinforcement-learning
15
+ dataset:
16
+ name: LunarLander-v2
17
+ type: LunarLander-v2
18
+ metrics:
19
+ - type: mean_reward
20
+ value: -138.99 +/- 59.23
21
+ name: mean_reward
22
+ verified: false
23
+ ---
24
+
25
+ # PPO Agent Playing LunarLander-v2
26
+
27
+ This is a trained model of a PPO agent playing LunarLander-v2.
28
+
29
+ # Hyperparameters
30
+
logs/events.out.tfevents.1679286687.hanbk-robotmecha.1078677.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4dc4fb89347b487650dd9d904494de9c6299279c7d54a6fb5271a0a04e09fb1c
3
+ size 111205
model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f59c8f9da66f962233a9a894cdc61ae834a79ac56cddc7ffcc6239da74a3500
3
+ size 42817
ppo.py ADDED
@@ -0,0 +1,589 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import random
4
+ import time
5
+ from distutils.util import strtobool
6
+
7
+ import gym
8
+ import numpy as np
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.optim as optim
12
+ from torch.distributions.categorical import Categorical
13
+ from torch.utils.tensorboard import SummaryWriter
14
+
15
+ from huggingface_hub import HfApi, upload_folder
16
+ from huggingface_hub.repocard import metadata_eval_result, metadata_save
17
+
18
+ from pathlib import Path
19
+ import datetime
20
+ import tempfile
21
+ import json
22
+ import shutil
23
+ import imageio
24
+
25
+ from wasabi import Printer
26
+
27
+ msg = Printer()
28
+
29
+
30
def parse_args():
    """Parse command-line arguments and derive batch sizes.

    Returns an argparse.Namespace with experiment / algorithm / Hub settings,
    plus two derived fields:
      - batch_size     = num_envs * num_steps   (one rollout's worth of data)
      - minibatch_size = batch_size // num_minibatches
    """

    def _str2bool(v):
        # Replacement for the deprecated distutils.util.strtobool (distutils
        # was removed in Python 3.12); accepts the same truthy/falsy spellings
        # and, like bool(strtobool(x)), returns a real bool.
        v = str(v).lower()
        if v in ("y", "yes", "t", "true", "on", "1"):
            return True
        if v in ("n", "no", "f", "false", "off", "0"):
            return False
        raise ValueError(f"invalid truth value {v!r}")

    # fmt: off
    parser = argparse.ArgumentParser()
    # BUG FIX: was os.path.basename(__file__).rstrip(".py") — rstrip strips a
    # character *set*, not a suffix (e.g. "happy.py" -> "ha"); splitext is correct.
    parser.add_argument("--exp-name", type=str, default=os.path.splitext(os.path.basename(__file__))[0],
        help="the name of this experiment")
    parser.add_argument("--seed", type=int, default=1,
        help="seed of the experiment")
    parser.add_argument("--torch-deterministic", type=_str2bool, default=True, nargs="?", const=True,
        help="if toggled, `torch.backends.cudnn.deterministic=False`")
    parser.add_argument("--cuda", type=_str2bool, default=True, nargs="?", const=True,
        help="if toggled, cuda will be enabled by default")
    parser.add_argument("--track", type=_str2bool, default=False, nargs="?", const=True,
        help="if toggled, this experiment will be tracked with Weights and Biases")
    parser.add_argument("--wandb-project-name", type=str, default="cleanRL",
        help="the wandb's project name")
    parser.add_argument("--wandb-entity", type=str, default=None,
        help="the entity (team) of wandb's project")
    parser.add_argument("--capture-video", type=_str2bool, default=False, nargs="?", const=True,
        help="whether to capture videos of the agent performances (check out `videos` folder)")

    # Algorithm specific arguments
    parser.add_argument("--env-id", type=str, default="CartPole-v1",
        help="the id of the environment")
    parser.add_argument("--total-timesteps", type=int, default=50000,
        help="total timesteps of the experiments")
    parser.add_argument("--learning-rate", type=float, default=2.5e-4,
        help="the learning rate of the optimizer")
    parser.add_argument("--num-envs", type=int, default=4,
        help="the number of parallel game environments")
    parser.add_argument("--num-steps", type=int, default=128,
        help="the number of steps to run in each environment per policy rollout")
    parser.add_argument("--anneal-lr", type=_str2bool, default=True, nargs="?", const=True,
        help="Toggle learning rate annealing for policy and value networks")
    parser.add_argument("--gae", type=_str2bool, default=True, nargs="?", const=True,
        help="Use GAE for advantage computation")
    parser.add_argument("--gamma", type=float, default=0.99,
        help="the discount factor gamma")
    parser.add_argument("--gae-lambda", type=float, default=0.95,
        help="the lambda for the general advantage estimation")
    parser.add_argument("--num-minibatches", type=int, default=4,
        help="the number of mini-batches")
    parser.add_argument("--update-epochs", type=int, default=4,
        help="the K epochs to update the policy")
    parser.add_argument("--norm-adv", type=_str2bool, default=True, nargs="?", const=True,
        help="Toggles advantages normalization")
    parser.add_argument("--clip-coef", type=float, default=0.2,
        help="the surrogate clipping coefficient")
    parser.add_argument("--clip-vloss", type=_str2bool, default=True, nargs="?", const=True,
        help="Toggles whether or not to use a clipped loss for the value function, as per the paper.")
    parser.add_argument("--ent-coef", type=float, default=0.01,
        help="coefficient of the entropy")
    parser.add_argument("--vf-coef", type=float, default=0.5,
        help="coefficient of the value function")
    parser.add_argument("--max-grad-norm", type=float, default=0.5,
        help="the maximum norm for the gradient clipping")
    parser.add_argument("--target-kl", type=float, default=None,
        help="the target KL divergence threshold")

    # Adding HuggingFace argument
    # NOTE(review): default says "LunaLander" (missing 'r') — looks like a typo
    # for "LunarLander", but it may be the author's actual repo name; confirm
    # before changing, since it identifies an existing Hub repository.
    parser.add_argument("--repo-id", type=str, default="bkhan2000/LunaLander-v2", help="id of the model repository from the Hugging Face Hub {username/repo_name}")

    args = parser.parse_args()
    args.batch_size = int(args.num_envs * args.num_steps)
    args.minibatch_size = int(args.batch_size // args.num_minibatches)
    # fmt: on
    return args
96
+
97
def package_to_hub(
    repo_id,
    model,
    hyperparameters,
    eval_env,
    video_fps=30,
    commit_message="Push agent to the Hub",
    token=None,
    logs=None,
):
    """
    Evaluate, Generate a video and Upload a model to Hugging Face Hub.
    This method does the complete pipeline:
    - It evaluates the model
    - It generates the model card
    - It generates a replay video of the agent
    - It pushes everything to the hub
    :param repo_id: id of the model repository from the Hugging Face Hub
    :param model: trained model
    :param hyperparameters: training arguments namespace (needs ``.env_id``)
    :param eval_env: environment used to evaluate the agent
    :param video_fps: number of fps for rendering the video
    :param commit_message: commit message
    :param token: Hugging Face auth token (None -> cached login)
    :param logs: directory on local machine of tensorboard logs you'd like to upload
    :return: URL of the uploaded repository
    """
    msg.info(
        "This function will save, evaluate, generate a video of your agent, "
        "create a model card and push everything to the hub. "
        "It might take up to 1min. \n "
        "This is a work in progress: if you encounter a bug, please open an issue."
    )
    # Step 1: Create the repo (no-op if it already exists, thanks to exist_ok)
    repo_url = HfApi().create_repo(
        repo_id=repo_id,
        token=token,
        private=False,
        exist_ok=True,
    )

    with tempfile.TemporaryDirectory() as tmpdirname:
        # BUG FIX: was `tmpdirname = Path("./")`, which discarded the temporary
        # directory and made upload_folder() push the *entire current working
        # directory* (source files, notebooks, ...) instead of just the
        # artifacts built below.
        tmpdirname = Path(tmpdirname)

        # Step 2: Save the model weights
        torch.save(model.state_dict(), tmpdirname / "model.pt")

        # Step 3: Evaluate the model and build JSON with evaluation metrics
        mean_reward, std_reward = _evaluate_agent(eval_env, 10, model)

        # First get datetime (ISO format, stored alongside the metrics)
        eval_datetime = datetime.datetime.now()
        eval_form_datetime = eval_datetime.isoformat()

        evaluate_data = {
            "env_id": hyperparameters.env_id,
            "mean_reward": mean_reward,
            "std_reward": std_reward,
            "n_evaluation_episodes": 10,
            "eval_datetime": eval_form_datetime,
        }

        # Write a JSON file
        with open(tmpdirname / "results.json", "w") as outfile:
            json.dump(evaluate_data, outfile)

        # Step 4: Record a replay video of the agent
        video_path = tmpdirname / "replay.mp4"
        record_video(eval_env, model, video_path, video_fps)

        # Step 5: Generate the model card (README.md body + YAML metadata)
        generated_model_card, metadata = _generate_model_card(
            "PPO", hyperparameters.env_id, mean_reward, std_reward, hyperparameters
        )
        _save_model_card(tmpdirname, generated_model_card, metadata)

        # Step 6: Copy TensorBoard logs into the repo, if provided
        if logs:
            _add_logdir(tmpdirname, Path(logs))

        msg.info(f"Pushing repo {repo_id} to the Hugging Face Hub")

        # upload_folder must run while the temporary directory still exists,
        # hence it stays inside the `with` block.
        repo_url = upload_folder(
            repo_id=repo_id,
            folder_path=tmpdirname,
            path_in_repo="",
            commit_message=commit_message,
            token=token,
        )

        msg.info(f"Your model is pushed to the Hub. You can view your model here: {repo_url}")
    return repo_url
186
+
187
def _evaluate_agent(env, n_eval_episodes, policy):
    """
    Evaluate the agent for ``n_eval_episodes`` episodes and return the mean
    and standard deviation of the per-episode rewards.
    :param env: The evaluation environment (old gym API: step returns 4 values)
    :param n_eval_episodes: Number of episodes to evaluate the agent
    :param policy: The agent (must expose ``get_action_and_value``)

    NOTE: relies on the module-level ``device`` defined in the __main__ block.
    """
    episode_rewards = []
    for _ in range(n_eval_episodes):  # episode index itself is unused
        state = env.reset()
        done = False
        total_rewards_ep = 0

        while not done:
            state = torch.Tensor(state).to(device)
            # No gradients are needed during evaluation — avoids building a
            # computation graph for every step.
            with torch.no_grad():
                action, _, _, _ = policy.get_action_and_value(state)
            # Stochastic policy sample; reward accumulates until the episode ends.
            state, reward, done, info = env.step(action.cpu().numpy())
            total_rewards_ep += reward
        episode_rewards.append(total_rewards_ep)

    mean_reward = np.mean(episode_rewards)
    std_reward = np.std(episode_rewards)

    return mean_reward, std_reward
214
+
215
+
216
def record_video(env, policy, out_directory, fps=30):
    """
    Roll out one episode with ``policy`` and save the rendered frames as a video.
    :param env: environment to record (old gym API with ``render(mode=...)``)
    :param policy: agent exposing ``get_action_and_value``
    :param out_directory: output video path (e.g. ``replay.mp4``)
    :param fps: frames per second of the written video

    NOTE: relies on the module-level ``device`` defined in the __main__ block.
    """
    images = []
    done = False
    state = env.reset()
    img = env.render(mode="rgb_array")
    images.append(img)
    while not done:
        state = torch.Tensor(state).to(device)
        # Sample an action from the policy (no gradients needed for recording)
        with torch.no_grad():
            action, _, _, _ = policy.get_action_and_value(state)
        state, reward, done, info = env.step(
            action.cpu().numpy()
        )  # We directly put next_state = state for recording logic
        img = env.render(mode="rgb_array")
        images.append(img)
    # FIX: dropped a pointless enumerate() whose index was never used.
    imageio.mimsave(out_directory, [np.array(frame) for frame in images], fps=fps)
232
+
233
+
234
def _generate_model_card(model_name, env_id, mean_reward, std_reward, hyperparameters):
    """
    Generate the model card for the Hub.
    :param model_name: name of the model (e.g. "PPO")
    :param env_id: name of the environment
    :param mean_reward: mean reward of the agent
    :param std_reward: standard deviation of the mean reward of the agent
    :param hyperparameters: training arguments (argparse.Namespace)
    :return: (markdown model card, metadata dict)
    """
    # Step 1: Select the tags / metrics metadata
    metadata = generate_metadata(model_name, env_id, mean_reward, std_reward)

    # Transform the hyperparams namespace to one "key: value" per line
    converted_dict = vars(hyperparameters)
    converted_str = str(converted_dict)
    converted_str = converted_str.split(", ")
    converted_str = "\n".join(converted_str)

    # Step 2: Generate the model card.
    # BUG FIX: `converted_str` was computed but never embedded, so the
    # published README ended at a bare "# Hyperparameters" heading (visible in
    # the repository's current README.md).
    model_card = f"""
  # PPO Agent Playing {env_id}

  This is a trained model of a PPO agent playing {env_id}.

  # Hyperparameters
  ```python
  {converted_str}
  ```
  """
    return model_card, metadata
261
+
262
+
263
def generate_metadata(model_name, env_id, mean_reward, std_reward):
    """
    Define the tags and evaluation metadata for the model card.
    :param model_name: name of the model
    :param env_id: name of the environment
    :param mean_reward: mean reward of the agent
    :param std_reward: standard deviation of the mean reward of the agent
    :return: metadata dict suitable for the Hub README YAML front matter
    """
    metadata = {}
    metadata["tags"] = [
        env_id,
        "ppo",
        "deep-reinforcement-learning",
        "reinforcement-learning",
        "custom-implementation",
        "deep-rl-course",
    ]

    # Add metrics. Renamed from `eval` — that name shadowed the builtin.
    eval_results = metadata_eval_result(
        model_pretty_name=model_name,
        task_pretty_name="reinforcement-learning",
        task_id="reinforcement-learning",
        metrics_pretty_name="mean_reward",
        metrics_id="mean_reward",
        metrics_value=f"{mean_reward:.2f} +/- {std_reward:.2f}",
        dataset_pretty_name=env_id,
        dataset_id=env_id,
    )

    # Merges both dictionaries (eval results win on key collisions)
    metadata = {**metadata, **eval_results}

    return metadata
297
+
298
+
299
def _save_model_card(local_path, generated_model_card, metadata):
    """Write README.md for the repository and attach the metric metadata.

    An already-existing README is kept as-is (only its YAML metadata is
    refreshed); otherwise the freshly generated model card becomes the README.
    :param local_path: repository directory (pathlib.Path)
    :param generated_model_card: model card generated by _generate_model_card()
    :param metadata: metadata dict saved into the README front matter
    """
    readme_path = local_path / "README.md"
    if readme_path.exists():
        with readme_path.open("r", encoding="utf8") as f:
            content = f.read()
    else:
        content = generated_model_card

    with readme_path.open("w", encoding="utf-8") as f:
        f.write(content)

    # Save our metrics into the README's YAML metadata block
    metadata_save(readme_path, metadata)
318
+
319
+
320
+ def _add_logdir(local_path: Path, logdir: Path):
321
+ """Adds a logdir to the repository.
322
+ :param local_path: repository directory
323
+ :param logdir: logdir directory
324
+ """
325
+ if logdir.exists() and logdir.is_dir():
326
+ # Add the logdir to the repository under new dir called logs
327
+ repo_logdir = local_path / "logs"
328
+
329
+ # Delete current logs if they exist
330
+ if repo_logdir.exists():
331
+ shutil.rmtree(repo_logdir)
332
+
333
+ # Copy logdir into repo logdir
334
+ shutil.copytree(logdir, repo_logdir)
335
+
336
+
337
def make_env(env_id, seed, idx, capture_video, run_name):
    """Return a thunk that builds one seeded, statistics-wrapped environment.

    Only the first environment (idx == 0) records video, to avoid writing the
    same episode from every parallel worker.
    :param env_id: gym environment id
    :param seed: seed for env, action space and observation space
    :param idx: index of this env among the parallel workers
    :param capture_video: whether video recording is enabled at all
    :param run_name: used to build the ``videos/<run_name>`` output folder
    """

    def thunk():
        env = gym.make(env_id)
        env = gym.wrappers.RecordEpisodeStatistics(env)
        if capture_video and idx == 0:
            env = gym.wrappers.RecordVideo(env, f"videos/{run_name}")
        # Seed everything for reproducibility (old gym API: env.seed exists)
        env.seed(seed)
        env.action_space.seed(seed)
        env.observation_space.seed(seed)
        return env

    return thunk
350
+
351
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Initialise *layer* in place: orthogonal weight (gain *std*), constant bias.

    Returns the same layer object so it can be used inline inside nn.Sequential.
    """
    weight, bias = layer.weight, layer.bias
    torch.nn.init.orthogonal_(weight, std)
    torch.nn.init.constant_(bias, bias_const)
    return layer
355
+
356
class Agent(nn.Module):
    """Actor-critic MLP for discrete action spaces.

    Both heads share the same architecture (two 64-unit tanh hidden layers)
    but no parameters. The actor's output layer uses a small init gain (0.01)
    so the initial policy is near-uniform; the critic's uses gain 1.0.
    """

    def __init__(self, envs):
        super().__init__()
        obs_dim = int(np.array(envs.single_observation_space.shape).prod())
        n_actions = envs.single_action_space.n

        def build_head(out_features, out_std):
            # One hidden-layer stack per head; only the last layer differs.
            return nn.Sequential(
                layer_init(nn.Linear(obs_dim, 64)),
                nn.Tanh(),
                layer_init(nn.Linear(64, 64)),
                nn.Tanh(),
                layer_init(nn.Linear(64, out_features), std=out_std),
            )

        self.critic = build_head(1, 1.0)
        self.actor = build_head(n_actions, 0.01)

    def get_value(self, x):
        """Return the critic's state-value estimate for observation *x*."""
        return self.critic(x)

    def get_action_and_value(self, x, action=None):
        """Sample (or score a given) action.

        :param x: observation tensor
        :param action: if given, score this action instead of sampling
        :return: (action, log_prob, entropy, state_value)
        """
        dist = Categorical(logits=self.actor(x))
        if action is None:
            action = dist.sample()
        return action, dist.log_prob(action), dist.entropy(), self.critic(x)
383
+
384
# Entry point: train a PPO agent (CleanRL-style single-file script), log to
# TensorBoard (and optionally W&B), then evaluate the agent and push
# everything to the Hugging Face Hub.
if __name__ == "__main__":
    args = parse_args()
    # Unique run name: <env>__<experiment>__<seed>__<unix-timestamp>
    run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}"
    if args.track:
        import wandb

        wandb.init(
            project=args.wandb_project_name,
            entity=args.wandb_entity,
            sync_tensorboard=True,  # mirror TensorBoard scalars into W&B
            config=vars(args),
            name=run_name,
            monitor_gym=True,
            save_code=True,
        )
    writer = SummaryWriter(f"runs/{run_name}")
    # Dump all hyperparameters as a markdown table into TensorBoard's text tab.
    writer.add_text(
        "hyperparameters",
        "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
    )

    # TRY NOT TO MODIFY: seeding — Python, NumPy and PyTorch RNGs plus cuDNN
    # determinism, so runs are reproducible given the same seed.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.backends.cudnn.deterministic = args.torch_deterministic

    device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")

    # env setup: num_envs synchronous copies, each seeded differently
    envs = gym.vector.SyncVectorEnv(
        [make_env(args.env_id, args.seed + i, i, args.capture_video, run_name) for i in range(args.num_envs)]
    )
    assert isinstance(envs.single_action_space, gym.spaces.Discrete), "only discrete action space is supported"

    agent = Agent(envs).to(device)
    optimizer = optim.Adam(agent.parameters(), lr=args.learning_rate, eps=1e-5)

    # ALGO Logic: Storage setup — rollout buffers shaped (num_steps, num_envs, ...)
    obs = torch.zeros((args.num_steps, args.num_envs) + envs.single_observation_space.shape).to(device)
    actions = torch.zeros((args.num_steps, args.num_envs) + envs.single_action_space.shape).to(device)
    logprobs = torch.zeros((args.num_steps, args.num_envs)).to(device)
    rewards = torch.zeros((args.num_steps, args.num_envs)).to(device)
    dones = torch.zeros((args.num_steps, args.num_envs)).to(device)
    values = torch.zeros((args.num_steps, args.num_envs)).to(device)

    # TRY NOT TO MODIFY: start the game
    global_step = 0
    start_time = time.time()
    next_obs = torch.Tensor(envs.reset()).to(device)
    next_done = torch.zeros(args.num_envs).to(device)
    # Each update consumes one batch (num_envs * num_steps environment steps)
    num_updates = args.total_timesteps // args.batch_size

    for update in range(1, num_updates + 1):
        # Annealing the rate if instructed to do so (linear decay to 0 over training).
        if args.anneal_lr:
            frac = 1.0 - (update - 1.0) / num_updates
            lrnow = frac * args.learning_rate
            optimizer.param_groups[0]["lr"] = lrnow

        # ---- Rollout phase: collect num_steps transitions from every env ----
        for step in range(0, args.num_steps):
            global_step += 1 * args.num_envs
            obs[step] = next_obs
            dones[step] = next_done

            # ALGO LOGIC: action logic — sample from the current policy,
            # without building a graph (rollouts need no gradients).
            with torch.no_grad():
                action, logprob, _, value = agent.get_action_and_value(next_obs)
                values[step] = value.flatten()
            actions[step] = action
            logprobs[step] = logprob

            # TRY NOT TO MODIFY: execute the game and log data.
            next_obs, reward, done, info = envs.step(action.cpu().numpy())
            rewards[step] = torch.tensor(reward).to(device).view(-1)
            next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(device)

            # Log the first finished episode found in this step's infos (if any).
            for item in info:
                if "episode" in item.keys():
                    print(f"global_step={global_step}, episodic_return={item['episode']['r']}")
                    writer.add_scalar("charts/episodic_return", item["episode"]["r"], global_step)
                    writer.add_scalar("charts/episodic_length", item["episode"]["l"], global_step)
                    break

        # bootstrap value if not done: compute advantages/returns for the rollout
        with torch.no_grad():
            next_value = agent.get_value(next_obs).reshape(1, -1)
            if args.gae:
                # Generalized Advantage Estimation — backwards recursion;
                # terminal transitions zero out the bootstrap via nextnonterminal.
                advantages = torch.zeros_like(rewards).to(device)
                lastgaelam = 0
                for t in reversed(range(args.num_steps)):
                    if t == args.num_steps - 1:
                        nextnonterminal = 1.0 - next_done
                        nextvalues = next_value
                    else:
                        nextnonterminal = 1.0 - dones[t + 1]
                        nextvalues = values[t + 1]
                    # One-step TD error
                    delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t]
                    advantages[t] = lastgaelam = delta + args.gamma * args.gae_lambda * nextnonterminal * lastgaelam
                returns = advantages + values
            else:
                # Plain discounted returns; advantage = return - value baseline
                returns = torch.zeros_like(rewards).to(device)
                for t in reversed(range(args.num_steps)):
                    if t == args.num_steps - 1:
                        nextnonterminal = 1.0 - next_done
                        next_return = next_value
                    else:
                        nextnonterminal = 1.0 - dones[t + 1]
                        next_return = returns[t + 1]
                    returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return
                advantages = returns - values

        # flatten the batch (merge the step and env dimensions)
        b_obs = obs.reshape((-1,) + envs.single_observation_space.shape)
        b_logprobs = logprobs.reshape(-1)
        b_actions = actions.reshape((-1,) + envs.single_action_space.shape)
        b_advantages = advantages.reshape(-1)
        b_returns = returns.reshape(-1)
        b_values = values.reshape(-1)

        # ---- Learning phase: K epochs of minibatch SGD over the rollout ----
        b_inds = np.arange(args.batch_size)
        clipfracs = []
        for epoch in range(args.update_epochs):
            np.random.shuffle(b_inds)
            for start in range(0, args.batch_size, args.minibatch_size):
                end = start + args.minibatch_size
                mb_inds = b_inds[start:end]

                # Re-score the stored actions under the current policy
                _, newlogprob, entropy, newvalue = agent.get_action_and_value(
                    b_obs[mb_inds], b_actions.long()[mb_inds]
                )
                logratio = newlogprob - b_logprobs[mb_inds]
                ratio = logratio.exp()

                with torch.no_grad():
                    # calculate approx_kl http://joschu.net/blog/kl-approx.html
                    old_approx_kl = (-logratio).mean()
                    approx_kl = ((ratio - 1) - logratio).mean()
                    # fraction of samples where the ratio was clipped
                    clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]

                mb_advantages = b_advantages[mb_inds]
                if args.norm_adv:
                    # Per-minibatch advantage normalization (1e-8 avoids div-by-zero)
                    mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-8)

                # Policy loss: PPO clipped surrogate objective (max of the two
                # negated terms == min of the surrogates)
                pg_loss1 = -mb_advantages * ratio
                pg_loss2 = -mb_advantages * torch.clamp(ratio, 1 - args.clip_coef, 1 + args.clip_coef)
                pg_loss = torch.max(pg_loss1, pg_loss2).mean()

                # Value loss: optionally clipped around the old value estimate
                newvalue = newvalue.view(-1)
                if args.clip_vloss:
                    v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
                    v_clipped = b_values[mb_inds] + torch.clamp(
                        newvalue - b_values[mb_inds],
                        -args.clip_coef,
                        args.clip_coef,
                    )
                    v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
                    v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                    v_loss = 0.5 * v_loss_max.mean()
                else:
                    v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()

                # Entropy bonus encourages exploration (subtracted from the loss)
                entropy_loss = entropy.mean()
                loss = pg_loss - args.ent_coef * entropy_loss + v_loss * args.vf_coef

                optimizer.zero_grad()
                loss.backward()
                # Global gradient-norm clipping for stability
                nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
                optimizer.step()

            # Early stopping: abort remaining epochs once KL exceeds the target
            if args.target_kl is not None:
                if approx_kl > args.target_kl:
                    break

        # Explained variance of the value function (1 = perfect predictor,
        # NaN when returns have zero variance).
        y_pred, y_true = b_values.cpu().numpy(), b_returns.cpu().numpy()
        var_y = np.var(y_true)
        explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y

        # TRY NOT TO MODIFY: record rewards for plotting purposes
        writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step)
        writer.add_scalar("losses/value_loss", v_loss.item(), global_step)
        writer.add_scalar("losses/policy_loss", pg_loss.item(), global_step)
        writer.add_scalar("losses/entropy", entropy_loss.item(), global_step)
        writer.add_scalar("losses/old_approx_kl", old_approx_kl.item(), global_step)
        writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step)
        writer.add_scalar("losses/clipfrac", np.mean(clipfracs), global_step)
        writer.add_scalar("losses/explained_variance", explained_var, global_step)
        print("SPS:", int(global_step / (time.time() - start_time)))  # steps/sec throughput
        writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step)

    envs.close()
    writer.close()

    # Create the evaluation environment
    # NOTE(review): `eval_env` is created but never used — a second, fresh env
    # is passed to package_to_hub below; one of the two is redundant.
    eval_env = gym.make(args.env_id)

    package_to_hub(
        repo_id=args.repo_id,
        model=agent,  # The model we want to save
        hyperparameters=args,
        eval_env=gym.make(args.env_id),
        logs=f"runs/{run_name}",
    )
ppo_cleanRL.ipynb ADDED
@@ -0,0 +1,272 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stderr",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "/home/hanbk/torch_venv/lib/python3.8/site-packages/IPython/core/display.py:419: UserWarning: Consider using IPython.display.IFrame instead\n",
13
+ " warnings.warn(\"Consider using IPython.display.IFrame instead\")\n"
14
+ ]
15
+ },
16
+ {
17
+ "data": {
18
+ "text/html": [
19
+ "<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/MEt6rrxH8W4\" title=\"YouTube video player\" frameborder=\"0\" allow=\"accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture\" allowfullscreen></iframe>"
20
+ ],
21
+ "text/plain": [
22
+ "<IPython.core.display.HTML object>"
23
+ ]
24
+ },
25
+ "execution_count": 1,
26
+ "metadata": {},
27
+ "output_type": "execute_result"
28
+ }
29
+ ],
30
+ "source": [
31
+ "from IPython.display import HTML\n",
32
+ "\n",
33
+ "HTML(\n",
34
+ " '<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/MEt6rrxH8W4\" title=\"YouTube video player\" frameborder=\"0\" allow=\"accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture\" allowfullscreen></iframe>'\n",
35
+ ")"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "execution_count": 3,
41
+ "metadata": {},
42
+ "outputs": [],
43
+ "source": [
44
+ "import argparse\n",
45
+ "import os\n",
46
+ "import random\n",
47
+ "import time\n",
48
+ "from distutils.util import strtobool\n",
49
+ "\n",
50
+ "import gym\n",
51
+ "import numpy as np\n",
52
+ "import torch\n",
53
+ "import torch.nn as nn\n",
54
+ "import torch.optim as optim\n",
55
+ "from torch.distributions.categorical import Categorical\n",
56
+ "from torch.utils.tensorboard import SummaryWriter\n",
57
+ "\n",
58
+ "from huggingface_hub import HfApi, upload_folder\n",
59
+ "from huggingface_hub.repocard import metadata_eval_result, metadata_save\n",
60
+ "\n",
61
+ "from pathlib import Path\n",
62
+ "import datetime\n",
63
+ "import tempfile\n",
64
+ "import json\n",
65
+ "import shutil\n",
66
+ "import imageio\n",
67
+ "\n",
68
+ "from wasabi import Printer\n",
69
+ "\n",
70
+ "msg = Printer()"
71
+ ]
72
+ },
73
+ {
74
+ "cell_type": "code",
75
+ "execution_count": 4,
76
+ "metadata": {},
77
+ "outputs": [],
78
+ "source": [
79
+ "def parse_args():\n",
80
+ " # fmt: off\n",
81
+ " parser = argparse.ArgumentParser()\n",
82
+ " parser.add_argument(\"--exp-name\", type=str, default=os.path.basename(__file__).rstrip(\".py\"),\n",
83
+ " help=\"the name of this experiment\")\n",
84
+ " parser.add_argument(\"--seed\", type=int, default=1,\n",
85
+ " help=\"seed of the experiment\")\n",
86
+ " parser.add_argument(\"--torch-deterministic\", type=lambda x: bool(strtobool(x)), default=True, nargs=\"?\", const=True,\n",
87
+ " help=\"if toggled, `torch.backends.cudnn.deterministic=False`\")\n",
88
+ " parser.add_argument(\"--cuda\", type=lambda x: bool(strtobool(x)), default=True, nargs=\"?\", const=True,\n",
89
+ " help=\"if toggled, cuda will be enabled by default\")\n",
90
+ " parser.add_argument(\"--track\", type=lambda x: bool(strtobool(x)), default=False, nargs=\"?\", const=True,\n",
91
+ " help=\"if toggled, this experiment will be tracked with Weights and Biases\")\n",
92
+ " parser.add_argument(\"--wandb-project-name\", type=str, default=\"cleanRL\",\n",
93
+ " help=\"the wandb's project name\")\n",
94
+ " parser.add_argument(\"--wandb-entity\", type=str, default=None,\n",
95
+ " help=\"the entity (team) of wandb's project\")\n",
96
+ " parser.add_argument(\"--capture-video\", type=lambda x: bool(strtobool(x)), default=False, nargs=\"?\", const=True,\n",
97
+ " help=\"weather to capture videos of the agent performances (check out `videos` folder)\")\n",
98
+ "\n",
99
+ " # Algorithm specific arguments\n",
100
+ " parser.add_argument(\"--env-id\", type=str, default=\"CartPole-v1\",\n",
101
+ " help=\"the id of the environment\")\n",
102
+ " parser.add_argument(\"--total-timesteps\", type=int, default=50000,\n",
103
+ " help=\"total timesteps of the experiments\")\n",
104
+ " parser.add_argument(\"--learning-rate\", type=float, default=2.5e-4,\n",
105
+ " help=\"the learning rate of the optimizer\")\n",
106
+ " parser.add_argument(\"--num-envs\", type=int, default=4,\n",
107
+ " help=\"the number of parallel game environments\")\n",
108
+ " parser.add_argument(\"--num-steps\", type=int, default=128,\n",
109
+ " help=\"the number of steps to run in each environment per policy rollout\")\n",
110
+ " parser.add_argument(\"--anneal-lr\", type=lambda x: bool(strtobool(x)), default=True, nargs=\"?\", const=True,\n",
111
+ " help=\"Toggle learning rate annealing for policy and value networks\")\n",
112
+ " parser.add_argument(\"--gae\", type=lambda x: bool(strtobool(x)), default=True, nargs=\"?\", const=True,\n",
113
+ " help=\"Use GAE for advantage computation\")\n",
114
+ " parser.add_argument(\"--gamma\", type=float, default=0.99,\n",
115
+ " help=\"the discount factor gamma\")\n",
116
+ " parser.add_argument(\"--gae-lambda\", type=float, default=0.95,\n",
117
+ " help=\"the lambda for the general advantage estimation\")\n",
118
+ " parser.add_argument(\"--num-minibatches\", type=int, default=4,\n",
119
+ " help=\"the number of mini-batches\")\n",
120
+ " parser.add_argument(\"--update-epochs\", type=int, default=4,\n",
121
+ " help=\"the K epochs to update the policy\")\n",
122
+ " parser.add_argument(\"--norm-adv\", type=lambda x: bool(strtobool(x)), default=True, nargs=\"?\", const=True,\n",
123
+ " help=\"Toggles advantages normalization\")\n",
124
+ " parser.add_argument(\"--clip-coef\", type=float, default=0.2,\n",
125
+ " help=\"the surrogate clipping coefficient\")\n",
126
+ " parser.add_argument(\"--clip-vloss\", type=lambda x: bool(strtobool(x)), default=True, nargs=\"?\", const=True,\n",
127
+ " help=\"Toggles whether or not to use a clipped loss for the value function, as per the paper.\")\n",
128
+ " parser.add_argument(\"--ent-coef\", type=float, default=0.01,\n",
129
+ " help=\"coefficient of the entropy\")\n",
130
+ " parser.add_argument(\"--vf-coef\", type=float, default=0.5,\n",
131
+ " help=\"coefficient of the value function\")\n",
132
+ " parser.add_argument(\"--max-grad-norm\", type=float, default=0.5,\n",
133
+ " help=\"the maximum norm for the gradient clipping\")\n",
134
+ " parser.add_argument(\"--target-kl\", type=float, default=None,\n",
135
+ " help=\"the target KL divergence threshold\")\n",
136
+ "\n",
137
+ " # Adding HuggingFace argument\n",
138
+ " parser.add_argument(\"--repo-id\", type=str, default=\"ThomasSimonini/ppo-CartPole-v1\", help=\"id of the model repository from the Hugging Face Hub {username/repo_name}\")\n",
139
+ "\n",
140
+ " args = parser.parse_args()\n",
141
+ " args.batch_size = int(args.num_envs * args.num_steps)\n",
142
+ " args.minibatch_size = int(args.batch_size // args.num_minibatches)\n",
143
+ " # fmt: on\n",
144
+ " return args"
145
+ ]
146
+ },
147
+ {
148
+ "cell_type": "code",
149
+ "execution_count": null,
150
+ "metadata": {},
151
+ "outputs": [],
152
+ "source": [
153
+ "def package_to_hub(\n",
154
+ " repo_id,\n",
155
+ " model,\n",
156
+ " hyperparameters,\n",
157
+ " eval_env,\n",
158
+ " video_fps=30,\n",
159
+ " commit_message=\"Push agent to the Hub\",\n",
160
+ " token=None,\n",
161
+ " logs=None,\n",
162
+ "):\n",
163
+ " \"\"\"\n",
164
+ " Evaluate, Generate a video and Upload a model to Hugging Face Hub.\n",
165
+ " This method does the complete pipeline:\n",
166
+ " - It evaluates the model\n",
167
+ " - It generates the model card\n",
168
+ " - It generates a replay video of the agent\n",
169
+ " - It pushes everything to the hub\n",
170
+ " :param repo_id: id of the model repository from the Hugging Face Hub\n",
171
+ " :param model: trained model\n",
172
+ " :param eval_env: environment used to evaluate the agent\n",
173
+ " :param fps: number of fps for rendering the video\n",
174
+ " :param commit_message: commit message\n",
175
+ " :param logs: directory on local machine of tensorboard logs you'd like to upload\n",
176
+ " \"\"\"\n",
177
+ " msg.info(\n",
178
+ " \"This function will save, evaluate, generate a video of your agent, \"\n",
179
+ " \"create a model card and push everything to the hub. \"\n",
180
+ " \"It might take up to 1min. \\n \"\n",
181
+ " \"This is a work in progress: if you encounter a bug, please open an issue.\"\n",
182
+ " )\n",
183
+ " # Step 1: Clone or create the repo\n",
184
+ " repo_url = HfApi().create_repo(\n",
185
+ " repo_id=repo_id,\n",
186
+ " token=token,\n",
187
+ " private=False,\n",
188
+ " exist_ok=True,\n",
189
+ " )\n",
190
+ "\n",
191
+ " with tempfile.TemporaryDirectory() as tmpdirname:\n",
192
+ " tmpdirname = Path(\"./\")\n",
193
+ "\n",
194
+ " # Step 2: Save the model\n",
195
+ " torch.save(model.state_dict(), tmpdirname / \"model.pt\")\n",
196
+ "\n",
197
+ " # Step 3: Evaluate the model and build JSON\n",
198
+ " mean_reward, std_reward = _evaluate_agent(eval_env, 10, model)\n",
199
+ "\n",
200
+ " # First get datetime\n",
201
+ " eval_datetime = datetime.datetime.now()\n",
202
+ " eval_form_datetime = eval_datetime.isoformat()\n",
203
+ "\n",
204
+ " evaluate_data = {\n",
205
+ " \"env_id\": hyperparameters.env_id,\n",
206
+ " \"mean_reward\": mean_reward,\n",
207
+ " \"std_reward\": std_reward,\n",
208
+ " \"n_evaluation_episodes\": 10,\n",
209
+ " \"eval_datetime\": eval_form_datetime,\n",
210
+ " }\n",
211
+ "\n",
212
+ " # Write a JSON file\n",
213
+ " with open(tmpdirname / \"results.json\", \"w\") as outfile:\n",
214
+ " json.dump(evaluate_data, outfile)\n",
215
+ "\n",
216
+ " # Step 4: Generate a video\n",
217
+ " video_path = tmpdirname / \"replay.mp4\"\n",
218
+ " record_video(eval_env, model, video_path, video_fps)\n",
219
+ "\n",
220
+ " # Step 5: Generate the model card\n",
221
+ " generated_model_card, metadata = _generate_model_card(\n",
222
+ " \"PPO\", hyperparameters.env_id, mean_reward, std_reward, hyperparameters\n",
223
+ " )\n",
224
+ " _save_model_card(tmpdirname, generated_model_card, metadata)\n",
225
+ "\n",
226
+ " # Step 6: Add logs if needed\n",
227
+ " if logs:\n",
228
+ " _add_logdir(tmpdirname, Path(logs))\n",
229
+ "\n",
230
+ " msg.info(f\"Pushing repo {repo_id} to the Hugging Face Hub\")\n",
231
+ "\n",
232
+ " repo_url = upload_folder(\n",
233
+ " repo_id=repo_id,\n",
234
+ " folder_path=tmpdirname,\n",
235
+ " path_in_repo=\"\",\n",
236
+ " commit_message=commit_message,\n",
237
+ " token=token,\n",
238
+ " )\n",
239
+ "\n",
240
+ " msg.info(f\"Your model is pushed to the Hub. You can view your model here: {repo_url}\")\n",
241
+ " return repo_url"
242
+ ]
243
+ }
244
+ ],
245
+ "metadata": {
246
+ "kernelspec": {
247
+ "display_name": "Python 3.8.10 ('torch_venv')",
248
+ "language": "python",
249
+ "name": "python3"
250
+ },
251
+ "language_info": {
252
+ "codemirror_mode": {
253
+ "name": "ipython",
254
+ "version": 3
255
+ },
256
+ "file_extension": ".py",
257
+ "mimetype": "text/x-python",
258
+ "name": "python",
259
+ "nbconvert_exporter": "python",
260
+ "pygments_lexer": "ipython3",
261
+ "version": "3.8.10"
262
+ },
263
+ "orig_nbformat": 4,
264
+ "vscode": {
265
+ "interpreter": {
266
+ "hash": "745a3b3e3fb7ac09f0ebb6d5eb47d006584e16db5d9df6f9a8b654baa561b29f"
267
+ }
268
+ }
269
+ },
270
+ "nbformat": 4,
271
+ "nbformat_minor": 2
272
+ }
replay.mp4 ADDED
Binary file (36.8 kB). View file
 
results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"env_id": "LunarLander-v2", "mean_reward": -138.9859847395379, "std_reward": 59.22793304996013, "n_evaluation_episodes": 10, "eval_datetime": "2023-03-20T13:32:08.124907"}
runs/LunarLander-v2__ppo__1__1679286470/events.out.tfevents.1679286470.hanbk-robotmecha.1077606.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2eede0401338d192353e4049216de71541d7c7bf1f1443a3dc4c78a9ea4620e
3
+ size 111191
runs/LunarLander-v2__ppo__1__1679286545/events.out.tfevents.1679286545.hanbk-robotmecha.1077962.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49b683c2873afb376f7e37ae0296bb028ca2df880329dc2f659d21406721cfd8
3
+ size 111191
runs/LunarLander-v2__ppo__1__1679286687/events.out.tfevents.1679286687.hanbk-robotmecha.1078677.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4dc4fb89347b487650dd9d904494de9c6299279c7d54a6fb5271a0a04e09fb1c
3
+ size 111205