| from __future__ import annotations |
|
|
| import datetime as dt |
| import json |
| import shutil |
| from pathlib import Path |
|
|
| import gymnasium as gym |
| import imageio |
| import numpy as np |
| from huggingface_hub import HfApi |
| from huggingface_hub.errors import HfHubHTTPError |
| from huggingface_hub.repocard import metadata_eval_result, metadata_save |
| from stable_baselines3 import PPO |
| from stable_baselines3.common.env_util import make_vec_env |
| from stable_baselines3.common.evaluation import evaluate_policy |
| from stable_baselines3.common.monitor import Monitor |
|
|
|
|
# Hugging Face account that owns the target repository (first half of REPO_ID).
USERNAME = "Sami94"
# Written into results.json and the generated README.
STUDENT_NAME = "Sami Chellia"
# Gymnasium environment id used for training, evaluation and video recording.
ENV_ID = "LunarLander-v2"
# Base name of the saved model file and second half of REPO_ID.
MODEL_NAME = "ppo-LunarLander-v2"
# Hub repository id in "<user>/<model>" form.
REPO_ID = f"{USERNAME}/{MODEL_NAME}"
# Local staging directory; save_artifacts() deletes and recreates it on every call.
OUTPUT_DIR = Path("artifacts/unit1") / MODEL_NAME
|
|
|
|
def evaluate(model: PPO, n_eval_episodes: int = 10) -> tuple[float, float]:
    """Evaluate ``model`` on a freshly created ``ENV_ID`` environment.

    Args:
        model: The PPO agent to evaluate.
        n_eval_episodes: Number of episodes handed to ``evaluate_policy``.

    Returns:
        ``(mean_reward, std_reward)`` converted to plain Python floats.
    """
    eval_env = Monitor(gym.make(ENV_ID, render_mode="rgb_array"))
    try:
        mean_reward, std_reward = evaluate_policy(
            model,
            eval_env,
            n_eval_episodes=n_eval_episodes,
            deterministic=True,
        )
    finally:
        # Close the environment even when evaluation raises, so it is not
        # leaked across the repeated evaluate() calls made during training.
        eval_env.close()
    return float(mean_reward), float(std_reward)
|
|
|
|
def record_video(model: PPO, out_path: Path, max_steps: int = 1000) -> None:
    """Roll out one deterministic episode and write its frames to ``out_path``.

    The environment is reset with a fixed seed (42). Frames are collected from
    ``env.render()`` after the reset and after every step, and the rollout
    stops at the first terminated/truncated step or after ``max_steps`` steps.

    Args:
        model: The PPO agent whose policy drives the episode.
        out_path: Destination file passed to ``imageio.mimsave``.
        max_steps: Upper bound on the number of environment steps recorded.
    """
    env = gym.make(ENV_ID, render_mode="rgb_array")
    try:
        obs, _ = env.reset(seed=42)
        frames = [env.render()]
        for _step in range(max_steps):
            action, _ = model.predict(obs, deterministic=True)
            obs, _, terminated, truncated, _ = env.step(action)
            frames.append(env.render())
            if terminated or truncated:
                break
    finally:
        # Release the environment even if the rollout raises part-way through.
        env.close()
    imageio.mimsave(out_path, [np.asarray(frame) for frame in frames], fps=30)
|
|
|
|
def save_artifacts(
    model: PPO,
    mean_reward: float,
    std_reward: float,
    timesteps: int,
    n_eval_episodes: int = 10,
) -> None:
    """Rebuild ``OUTPUT_DIR`` with the model, a replay, results and a README.

    Any existing ``OUTPUT_DIR`` is deleted first, so the directory only ever
    holds the artifacts of the most recent call.

    Args:
        model: The PPO agent to save and to record a replay of.
        mean_reward: Mean evaluation reward to publish.
        std_reward: Standard deviation of the evaluation reward to publish.
        timesteps: Training timesteps to report in results.json and the README.
        n_eval_episodes: Number of evaluation episodes behind the reported
            rewards, recorded in results.json. Defaults to 10, the value that
            was previously hard-coded.
    """
    # Start from a clean directory so stale files are never uploaded.
    if OUTPUT_DIR.exists():
        shutil.rmtree(OUTPUT_DIR)
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    model.save(OUTPUT_DIR / MODEL_NAME)
    record_video(model, OUTPUT_DIR / "replay.mp4")

    results = {
        "env_id": ENV_ID,
        "mean_reward": mean_reward,
        "std_reward": std_reward,
        "n_eval_episodes": n_eval_episodes,
        "total_timesteps": timesteps,
        # Attach the local UTC offset so the timestamp is unambiguous when the
        # file is read on another machine.
        "eval_datetime": dt.datetime.now().astimezone().isoformat(),
        "student": STUDENT_NAME,
        "hf_username": USERNAME,
    }
    (OUTPUT_DIR / "results.json").write_text(json.dumps(results, indent=2), encoding="utf-8")

    # Model-card metadata: free-form tags merged with the evaluation result.
    metadata = {
        "tags": [
            ENV_ID,
            "ppo",
            "stable-baselines3",
            "reinforcement-learning",
            "huggingface-deep-rl-course",
        ]
    }
    eval_metadata = metadata_eval_result(
        model_pretty_name=MODEL_NAME,
        task_pretty_name="reinforcement-learning",
        task_id="reinforcement-learning",
        metrics_pretty_name="mean_reward",
        metrics_id="mean_reward",
        metrics_value=f"{mean_reward:.2f} +/- {std_reward:.2f}",
        dataset_pretty_name=ENV_ID,
        dataset_id=ENV_ID,
    )
    metadata = {**metadata, **eval_metadata}

    # The README body is written first; metadata_save then records the
    # metadata in the same file.
    readme_path = OUTPUT_DIR / "README.md"
    readme_path.write_text(
        f"""# PPO Agent for {ENV_ID}

Student: {STUDENT_NAME}
Hugging Face username: {USERNAME}

This repository contains a PPO agent trained for the Hugging Face Deep RL course.

Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}
Total timesteps: {timesteps}

```python
from huggingface_hub import hf_hub_download
from stable_baselines3 import PPO

model_path = hf_hub_download(repo_id="{REPO_ID}", filename="{MODEL_NAME}.zip")
model = PPO.load(model_path)
```
""",
        encoding="utf-8",
    )
    metadata_save(readme_path, metadata)
|
|
|
|
def push_artifacts() -> str:
    """Upload ``OUTPUT_DIR`` to the ``REPO_ID`` model repository on the Hub.

    Returns:
        The repository URL as a string on success. If the Hub raises
        ``HfHubHTTPError``, the failure is printed and a
        ``"LOCAL_ONLY:<resolved OUTPUT_DIR>"`` marker is returned instead.
    """
    hub = HfApi()
    try:
        created = hub.create_repo(repo_id=REPO_ID, repo_type="model", exist_ok=True)
        hub.upload_folder(
            repo_id=REPO_ID,
            repo_type="model",
            folder_path=OUTPUT_DIR,
            path_in_repo=".",
        )
    except HfHubHTTPError as exc:
        # Best-effort push: report the failure and point at the local copy.
        print(f"Hub push failed for {REPO_ID}: {exc}")
        print(f"Local artifacts saved in {OUTPUT_DIR.resolve()}")
        return f"LOCAL_ONLY:{OUTPUT_DIR.resolve()}"
    return str(created)
|
|
|
|
def main() -> None:
    """Train PPO in chunks, keep the best checkpoint, and push it to the Hub.

    After each training chunk the agent is evaluated. Artifacts are rewritten
    only when the mean reward improves on the best seen so far, so a later
    regression cannot replace a better checkpoint. Training stops early once
    the mean reward reaches 200.

    Raises:
        RuntimeError: If training finished without a single evaluation.
    """
    env = make_vec_env(ENV_ID, n_envs=16)
    best: tuple[float, float] | None = None
    try:
        model = PPO(
            policy="MlpPolicy",
            env=env,
            n_steps=1024,
            batch_size=64,
            n_epochs=4,
            gamma=0.999,
            gae_lambda=0.98,
            ent_coef=0.01,
            verbose=1,
        )

        for chunk in [200_000, 200_000, 200_000, 200_000, 200_000]:
            model.learn(total_timesteps=chunk, reset_num_timesteps=False)
            # Report what was actually collected: PPO gathers whole rollouts
            # (n_envs * n_steps), so it overshoots the requested chunk size.
            total_timesteps = model.num_timesteps
            mean_reward, std_reward = evaluate(model)
            print(f"Evaluation after {total_timesteps} timesteps: {mean_reward:.2f} +/- {std_reward:.2f}")
            # Persist only improvements so the artifacts on disk, and the
            # reward reported at the end, always describe the best checkpoint.
            if best is None or mean_reward > best[0]:
                best = (mean_reward, std_reward)
                save_artifacts(model, mean_reward, std_reward, total_timesteps)
            if mean_reward >= 200:
                print("Certification threshold reached for Unit 1.")
                break
    finally:
        # Shut the vectorized environments down even if training fails.
        env.close()

    if best is None:
        raise RuntimeError("Training finished without evaluation.")

    url = push_artifacts()
    # Written after the upload, so this file stays local.
    (OUTPUT_DIR / "pushed_repo.json").write_text(
        json.dumps({"repo_id": REPO_ID, "url": url}, indent=2),
        encoding="utf-8",
    )
    print(json.dumps({"repo_id": REPO_ID, "url": url, "mean_reward": best[0], "std_reward": best[1]}, indent=2))
|
|
|
|
# Run the training/publishing pipeline only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
|
|