Update README.md

The SAC actor is a multi-layer perceptron with the following specifications:

- Linear(17, 256) -> ReLU
- Linear(256, 256) -> ReLU
- Linear(256, 6) for `mean` + Linear(256, 6) for `log_std`
- **Note:** The actor outputs a mean and a log standard deviation for each action. For inference, only the mean is used, passed through a tanh activation to bound actions to [-1, 1] (see the sampling sketch below).
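
For completeness, here is a sketch of how the `log_std` head would be used when sampling stochastic actions (e.g., during training). It assumes the standard SAC squashed-Gaussian policy and the `SACActor` class from the test code below; the clamp bounds are a common SAC convention, not values stored in the checkpoint:

```python
import torch

LOG_STD_MIN, LOG_STD_MAX = -20.0, 2.0  # common SAC clamp range (assumption)

def sample_action(actor, obs):
    """Reparameterized tanh-Gaussian sample from the actor's two heads."""
    x = actor.net(obs)
    mean = actor.mean(x)
    log_std = actor.log_std(x).clamp(LOG_STD_MIN, LOG_STD_MAX)
    std = log_std.exp()
    z = mean + std * torch.randn_like(std)  # reparameterization trick
    return torch.tanh(z)                    # squash to [-1, 1]
```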

## Common Mistakes to Avoid

- **Layer Names:** The checkpoint uses `net`, `mean`, and `log_std`. Do not redefine the layers under different names (`fc1`, `fc2`) unless you remap the keys (see the sketch after this list).
- **Output Dimensions:** Ensure the actor matches the checkpoint dimensions (6 actions).
- **Continuous Actions:** HalfCheetah requires NumPy arrays for actions. Flatten tensors and convert them to NumPy.
- **Episode Evaluation:** Always test over full episodes (100 recommended) to properly evaluate performance.
- **Checkpoint Loading:** Use `weights_only=True` when loading `.pth` state dicts for safety.
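
If your actor is already written with differently named layers, you can remap the checkpoint keys instead of rewriting the module. A minimal sketch, assuming hypothetical `fc1`/`fc2` layers that correspond to `net.0` and `net.2` in the checkpoint's `nn.Sequential` trunk:

```python
# fc1/fc2 and my_actor are hypothetical; the net.0/net.2 keys follow from
# the nn.Sequential layout of SACActor shown in the test code below.
key_map = {
    "net.0.weight": "fc1.weight", "net.0.bias": "fc1.bias",
    "net.2.weight": "fc2.weight", "net.2.bias": "fc2.bias",
}
state_dict = ckpt["actor_state_dict"]
remapped = {key_map.get(k, k): v for k, v in state_dict.items()}
my_actor.load_state_dict(remapped)
```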

## Download and Test Code

```python
import gymnasium as gym
import torch
import torch.nn as nn
import numpy as np
from huggingface_hub import hf_hub_download

# Load stripped checkpoint
ckpt = torch.load(
    hf_hub_download("Nharen/Reward_Rush_SAC_Half_Cheetah", "half_cheetah.pth"),
    weights_only=True
)

obs_dim = ckpt["obs_dim"]
act_dim = ckpt["act_dim"]
hidden_dim = ckpt.get("hidden_dim", 256)

# SAC Gaussian actor: shared trunk plus mean/log_std heads
class SACActor(nn.Module):
    def __init__(self, obs_dim, act_dim, hidden_dim=256):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU()
        )
        self.mean = nn.Linear(hidden_dim, act_dim)
        self.log_std = nn.Linear(hidden_dim, act_dim)

    def forward(self, obs):
        # Deterministic inference: tanh of the mean, ignoring log_std
        x = self.net(obs)
        mean = self.mean(x)
        return torch.tanh(mean)

# Instantiate actor and load the checkpoint weights
actor = SACActor(obs_dim, act_dim, hidden_dim)
actor.load_state_dict(ckpt["actor_state_dict"])
actor.eval()

# Environment
env = gym.make("HalfCheetah-v4")
num_episodes = 100
episode_rewards = []

# Run evaluation
for ep in range(num_episodes):
    obs, _ = env.reset()
    done = False
    ep_reward = 0.0

    while not done:
        with torch.no_grad():
            obs_t = torch.tensor(obs, dtype=torch.float32).unsqueeze(0)
            action = actor(obs_t).squeeze(0).cpu().numpy()
        obs, reward, terminated, truncated, _ = env.step(action)
        ep_reward += reward
        done = terminated or truncated

    episode_rewards.append(ep_reward)
    print(f"Episode {ep+1:3d} | Reward: {ep_reward:.2f}")

env.close()

# Results
episode_rewards = np.array(episode_rewards)
print("\n===== Evaluation Summary =====")
print(f"Episodes run: {num_episodes}")
print(f"Mean reward: {episode_rewards.mean():.2f}")
print(f"Std reward: {episode_rewards.std():.2f}")
print(f"Min reward: {episode_rewards.min():.2f}")
print(f"Max reward: {episode_rewards.max():.2f}")
```
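
To watch the policy instead of only logging returns, swap in gymnasium's standard `render_mode` argument; this is generic gymnasium usage, not something specific to this checkpoint:

```python
env = gym.make("HalfCheetah-v4", render_mode="human")  # needs a local display
```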