import d4rl  # noqa: F401 -- imported for its side effect of registering D4RL envs with gym
import gym
import tqdm
from diffusers.experimental import ValueGuidedRLPipeline


# Sampler/guidance settings for the value-guided diffusion planner.
config = {
    "n_samples": 64,            # candidate trajectories sampled per planning step
    "horizon": 32,              # planning horizon; matches the "hor32" checkpoint below
    "num_inference_steps": 20,  # diffusion denoising steps per plan
    "n_guide_steps": 2,         # value-gradient guidance steps per denoising step (0 disables guidance)
    "scale_grad_by_std": True,  # scale the guidance gradient by the posterior standard deviation
    "scale": 0.1,               # guidance strength
    "eta": 0.0,                 # amount of noise injected during sampling (0.0 = deterministic)
    "t_grad_cutoff": 2,         # skip guidance below this diffusion timestep
    "device": "cpu",
}


if __name__ == "__main__":
    env_name = "hopper-medium-v2"
    env = gym.make(env_name)

    # Load the pretrained value-guided planner (diffusion model + value
    # function) trained on hopper-medium-v2 with a planning horizon of 32.
    pipeline = ValueGuidedRLPipeline.from_pretrained(
        "bglick13/hopper-medium-v2-value-function-hor32",
        env=env,
    )
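    # Optional sketch (assumption: ValueGuidedRLPipeline follows the standard
    # DiffusionPipeline device API), to run planning on the configured device:
    #   pipeline = pipeline.to(config["device"])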

    env.seed(0)
    obs = env.reset()
    total_reward = 0
    total_score = 0
    T = 1000
    rollout = [obs.copy()]
    try:
        for t in tqdm.tqdm(range(T)):
            # Plan from the current observation; the pipeline returns the
            # denormalized first action of the best sampled trajectory.
            denorm_actions = pipeline(obs, planning_horizon=32)
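            # Sketch (assumption: ValueGuidedRLPipeline.__call__ also accepts
            # batch_size, n_guide_steps, and scale), forwarding the config
            # above instead of relying on the defaults:
            #   denorm_actions = pipeline(
            #       obs,
            #       batch_size=config["n_samples"],
            #       planning_horizon=config["horizon"],
            #       n_guide_steps=config["n_guide_steps"],
            #       scale=config["scale"],
            #   )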

            # Execute the planned action in the environment.
            next_observation, reward, terminal, _ = env.step(denorm_actions)

            # Update the return first so the normalized D4RL score reflects
            # the reward just received.
            total_reward += reward
            score = env.get_normalized_score(total_reward)
            total_score += score
            print(
                f"Step: {t}, Reward: {reward}, Total Reward: {total_reward}, Score: {score}, Total Score:"
                f" {total_score}"
            )

            # Save the observation for later rendering/inspection.
            rollout.append(next_observation.copy())

            obs = next_observation

            # Stop early once the environment signals episode termination.
            if terminal:
                break
    except KeyboardInterrupt:
        pass

    print(f"Total reward: {total_reward}")
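    # Optional sketch (assumes numpy, already a gym dependency): persist the
    # collected rollout for offline rendering or analysis.
    #   import numpy as np
    #   np.save("hopper_rollout.npy", np.stack(rollout))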