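"""Evaluate a saved DDPG agent on a continuous-control Gymnasium task.

Loads actor and critic weights from a CleanRL-style checkpoint, rolls out a
fixed number of evaluation episodes (with video capture), prints each
episodic return, and optionally pushes the results to the Hugging Face Hub.
"""
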
import os
from dataclasses import dataclass
from typing import Callable

import gymnasium as gym
import torch
import torch.nn as nn
import tyro


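# Experiment configuration: a flat dataclass parsed from the command line by tyro.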
@dataclass
class Args:
    exp_name: str = os.path.basename(__file__)[: -len(".py")]
    """the name of this experiment"""
    seed: int = 1
    """seed of the experiment"""
    torch_deterministic: bool = True
    """if toggled, `torch.backends.cudnn.deterministic` is set to `True` for reproducibility"""
    cuda: bool = True
    """if toggled, CUDA will be used when available"""
    track: bool = False
    """if toggled, this experiment will be tracked with Weights and Biases"""
    wandb_project_name: str = "cleanRL"
    """the wandb's project name"""
    wandb_entity: str = None
    """the entity (team) of wandb's project"""
    capture_video: bool = False
    """whether to capture videos of the agent performances (check out `videos` folder)"""
    save_model: bool = False
    """whether to save model into the `runs/{run_name}` folder"""
    upload_model: bool = False
    """whether to upload the saved model to huggingface"""
    hf_entity: str = "jacksonhack"
    """the user or org name of the model repository from the Hugging Face Hub"""

    # Algorithm specific arguments
    env_id: str = "Hopper-v5"
    """the environment id of the Atari game"""
    total_timesteps: int = 1000000
    """total timesteps of the experiments"""
    learning_rate: float = 3e-4
    """the learning rate of the optimizer"""
    buffer_size: int = int(1e6)
    """the replay memory buffer size"""
    gamma: float = 0.99
    """the discount factor gamma"""
    tau: float = 0.005
    """target smoothing coefficient (default: 0.005)"""
    batch_size: int = 256
    """the batch size of sample from the reply memory"""
    exploration_noise: float = 0.1
    """the scale of exploration noise"""
    learning_starts: int = 25_000
    """timestep to start learning"""
    policy_frequency: int = 2
    """the frequency of training policy (delayed)"""
    noise_clip: float = 0.5
    """noise clip parameter of the Target Policy Smoothing Regularization"""


def evaluate(
    model_path: str,
    make_env: Callable,
    env_id: str,
    eval_episodes: int,
    run_name: str,
    Model: tuple[type[nn.Module], type[nn.Module]],
    device: torch.device = torch.device("cpu"),
    capture_video: bool = True,
    exploration_noise: float = 0.1,
):
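    """Roll out a saved DDPG policy for `eval_episodes` episodes and return the episodic returns.

    `Model` is an (actor_class, critic_class) pair; the critic is loaded to
    validate the checkpoint but is not needed for action selection.
    """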
    envs = gym.vector.SyncVectorEnv([make_env(env_id, 0, 0, capture_video, run_name)])
    actor = Model[0](envs).to(device)
    qf = Model[1](envs).to(device)
    actor_params, qf_params = torch.load(model_path, map_location=device)
    actor.load_state_dict(actor_params)
    actor.eval()
    qf.load_state_dict(qf_params)
    qf.eval()
    # note: qf is not used in this script

    obs, _ = envs.reset()
    episodic_returns = []
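    # Step until `eval_episodes` episodes have finished. As during training,
    # Gaussian exploration noise is added to the actor's output; pass
    # exploration_noise=0.0 to evaluate the purely deterministic policy.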
    while len(episodic_returns) < eval_episodes:
        with torch.no_grad():
            actions = actor(torch.Tensor(obs).to(device))
            actions += torch.normal(0, actor.action_scale * exploration_noise)
            actions = actions.cpu().numpy().clip(envs.single_action_space.low, envs.single_action_space.high)

        next_obs, _, _, _, infos = envs.step(actions)

        if "episode" in infos:
            print(f"eval_episode={len(episodic_returns)}, episodic_return={infos['episode']['r'][infos['_episode']]}")
            episodic_returns += [infos["episode"]["r"]]
            # print(f"global_step={global_step}, episode_return={infos['episode']['r'][infos['_episode']][0]}")
            # writer.add_scalar("charts/episodic_return", infos["episode"]["r"][infos["_episode"]][0], global_step)
            # writer.add_scalar("charts/episodic_length", infos["episode"]["l"][infos["_episode"]][0], global_step)

        # if "final_info" in infos:
        #     for info in infos["final_info"]:
        #         if "episode" not in info:
        #             continue
        #         print(f"eval_episode={len(episodic_returns)}, episodic_return={info['episode']['r']}")
        #         episodic_returns += [info["episode"]["r"]]

        obs = next_obs

    print(f"episodic_returns: {episodic_returns}")
    return episodic_returns


if __name__ == "__main__":
    # from huggingface_hub import hf_hub_download

    from rl.ddpg import Actor, QNetwork, make_env

    # model_path = hf_hub_download(
    #     repo_id="cleanrl/HalfCheetah-v4-ddpg_continuous_action-seed1", filename="ddpg_continuous_action.cleanrl_model"
    # )
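    # NOTE: run_name points at a specific local training run; update it (and the
    # corresponding `runs/` checkpoint) before evaluating your own agent.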
    run_name = "Hopper-v5__ddpg__1__1732697106"
    model_path = "runs/Hopper-v5__ddpg__1__1732697106/ddpg.cleanrl_model"
    episodic_returns = evaluate(
        model_path,
        make_env,
        "Hopper-v5",
        eval_episodes=10,
        run_name=f"{run_name}-eval",
        Model=(Actor, QNetwork),
        device="cpu",
        capture_video=True,
    )

    if args.upload_model:
        from rl_utils.huggingface import push_to_hub

        repo_name = f"{args.env_id}-{args.exp_name}-seed{args.seed}"
        repo_id = f"{args.hf_entity}/{repo_name}" if args.hf_entity else repo_name
        push_to_hub(args, episodic_returns, repo_id, "DDPG", f"runs/{run_name}", f"videos/{run_name}-eval")