Andrei Cozma committed on
Commit · 30bb976
Parent(s): 120dc90
Updates
Browse files
- MCAgent.py +10 -12
- demo.py +3 -1
- policies/{MCAgent_CliffWalking-v0_gamma:1.0_epsilon:0.4_e1500_s200_first_visit.npy → MCAgent_CliffWalking-v0_gamma:1.0_epsilon:0.4_type:onpolicy_e1500_s200.npy} +0 -0
- policies/{MCAgent_FrozenLake-v1_gamma:1.0_epsilon:0.4_size:8_seed:35280_e1500_s200_first_visit.npy → MCAgent_FrozenLake-v1_gamma:1.0_epsilon:0.4_size:8_seed:35280_type:onpolicy_e1500_s200.npy} +0 -0
- policies/{MCAgent_FrozenLake-v1_gamma:1.0_epsilon:0.4_size:8_seed:61252_e1500_s200_first_visit.npy → MCAgent_FrozenLake-v1_gamma:1.0_epsilon:0.4_size:8_seed:61252_type:onpolicy_e1500_s200.npy} +0 -0
- policies/{MCAgent_FrozenLake-v1_gamma:1.0_epsilon:0.4_size:8_seed:96883_e1500_s200_first_visit.npy → MCAgent_FrozenLake-v1_gamma:1.0_epsilon:0.4_size:8_seed:96883_type:onpolicy_e1500_s200.npy} +0 -0
- policies/{MCAgent_Taxi-v3_gamma:1.0_epsilon:0.75_e15000_s200_first_visit.npy → MCAgent_Taxi-v3_gamma:1.0_epsilon:0.75_type:onpolicy_e15000_s200.npy} +0 -0
- run.py +4 -4
MCAgent.py
CHANGED
@@ -5,12 +5,10 @@ from AgentBase import AgentBase
 
 
 class MCAgent(AgentBase):
-    def __init__(
-        self, /, update_type="on-policy", **kwargs  # "on-policy" or "off-policy"
-    ):
+    def __init__(self, /, type="onpolicy", **kwargs):  # "onpolicy" or "offpolicy"
         super().__init__(run_name=self.__class__.__name__, **kwargs)
-        self.…
-        self.run_name = f"{self.…
+        self.type = type
+        self.run_name += f"_type:{self.type}"
         self.initialize()
 
     def initialize(self):
@@ -23,13 +21,13 @@ class MCAgent(AgentBase):
         # self.Q = np.random.rand(self.n_states, self.n_actions)
         # self.Q = np.random.normal(0, 1, size=(self.n_states, self.n_actions))
 
-        if self.…
+        if self.type.startswith("onpolicy"):
             # For On-Policy update type:
             # R keeps track of all the returns that have been observed for each state-action pair to update Q
             self.R = [[[] for _ in range(self.n_actions)] for _ in range(self.n_states)]
             # An arbitrary e-greedy policy:
             self.Pi = self.create_soft_policy()
-        elif self.…
+        elif self.type.startswith("offpolicy"):
             # For Off-Policy update type:
             self.C = np.zeros((self.n_states, self.n_actions))
             # Target policy is greedy with respect to the current Q (ties broken consistently)
@@ -39,7 +37,7 @@ class MCAgent(AgentBase):
             self.Pi_behaviour = self.create_soft_policy(coverage_policy=self.Pi)
         else:
             raise ValueError(
-                f"…
+                f"Parameter 'type' must be either 'onpolicy' or 'offpolicy', but got '{self.type}'"
             )
         print("=" * 80)
         print("Initial policy:")
@@ -67,7 +65,7 @@ class MCAgent(AgentBase):
         )
         return Pi
 
-    def …
+    def update_onpolicy(self, episode_hist):
         G = 0.0
         # For each step of the episode, in reverse order
         for t in range(len(episode_hist) - 1, -1, -1):
@@ -106,7 +104,7 @@ class MCAgent(AgentBase):
         # 1 - self.epsilon + self.epsilon / self.n_actions
         # )
 
-    def …
+    def update_offpolicy(self, episode_hist):
         G, W = 0.0, 1.0
         for t in range(len(episode_hist) - 1, -1, -1):
             state, action, reward = episode_hist[t]
@@ -154,7 +152,7 @@ class MCAgent(AgentBase):
             "avg_ep_len": avg_ep_len,
         }
 
-        update_func = getattr(self, f"update_{self.…
+        update_func = getattr(self, f"update_{self.type}")
 
         tqrange = tqdm(range(n_train_episodes))
         tqrange.set_description("Training")
@@ -163,7 +161,7 @@ class MCAgent(AgentBase):
             self.wandb_log_img(episode=None)
 
         for e in tqrange:
-            policy = self.Pi_behaviour if self.…
+            policy = self.Pi_behaviour if self.type.startswith("offpolicy") else self.Pi
             episode_hist, solved, _ = self.run_episode(policy=policy, **kwargs)
             rewards = [x[2] for x in episode_hist]
             total_reward, avg_reward = sum(rewards), np.mean(rewards)
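
For reference, here is a minimal sketch of what the two renamed update methods appear to implement: first-visit on-policy Monte-Carlo control (consistent with the old `_first_visit` policy file names) and off-policy Monte-Carlo control with weighted importance sampling. It reuses the attribute names visible in the diff (`Q`, `R`, `C`, `Pi`, `Pi_behaviour`, `gamma`, `epsilon`) but is an illustrative reconstruction, not the repository's code:

import numpy as np

def update_onpolicy(Q, Pi, R, episode_hist, gamma, epsilon):
    # First-visit on-policy MC control: walk the episode backwards,
    # accumulating the discounted return G.
    n_actions = Q.shape[1]
    G = 0.0
    for t in range(len(episode_hist) - 1, -1, -1):
        state, action, reward = episode_hist[t]
        G = gamma * G + reward
        # Only update on the first visit to this (state, action) pair.
        if (state, action) not in [(s, a) for s, a, _ in episode_hist[:t]]:
            R[state][action].append(G)
            Q[state, action] = np.mean(R[state][action])
            # Keep Pi epsilon-greedy with respect to the updated Q.
            best = np.argmax(Q[state])
            Pi[state] = epsilon / n_actions
            Pi[state, best] += 1.0 - epsilon

def update_offpolicy(Q, Pi, Pi_behaviour, C, episode_hist, gamma):
    # Off-policy MC control with weighted importance sampling:
    # G is the return, W the importance-sampling weight,
    # C the cumulative sum of weights per state-action pair.
    G, W = 0.0, 1.0
    for t in range(len(episode_hist) - 1, -1, -1):
        state, action, reward = episode_hist[t]
        G = gamma * G + reward
        C[state, action] += W
        Q[state, action] += (W / C[state, action]) * (G - Q[state, action])
        # Target policy is greedy w.r.t. Q; once the behaviour action
        # deviates from it, the weight for all earlier steps would be zero.
        greedy = np.argmax(Q[state])
        Pi[state] = 0.0
        Pi[state, greedy] = 1.0
        if action != greedy:
            break
        W /= Pi_behaviour[state, action]

A likely motivation for dropping the hyphen from the old "on-policy"/"off-policy" values: the training loop dispatches via getattr(self, f"update_{self.type}"), and a method named update_on-policy cannot be defined, so the flag value has to be a valid identifier suffix.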
demo.py
CHANGED
@@ -154,7 +154,8 @@ def run(
         agent = load_agent(
             policy_path, return_agent_env_keys=True, render_mode="rgb_array"
         )
-    except ValueError:
+    except ValueError as e:
+        print(f"🚫 Error: {e}")
         yield localstate, None, None, None, None, None, None, None, None, None, None, "🚫 Please select a valid policy file."
         return
 
@@ -185,6 +186,7 @@ def run(
 
     for step, (episode_hist, solved, frame_env) in enumerate(
         agent.generate_episode(
+            policy=agent.Pi,
             max_steps=max_steps,
             render=True,
         )
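
The second hunk makes the rollout policy explicit instead of relying on a default inside the agent. A hypothetical sketch of the generator interface being driven here, assuming a Gymnasium-style `env` and a tabular `(n_states, n_actions)` policy matrix (names and signature are assumptions, not demo.py's actual code):

import numpy as np

def generate_episode(env, policy, max_steps, render=False):
    # Sample each action from the policy row for the current state,
    # yielding after every step so the caller can stream frames to the UI.
    episode_hist = []
    state, _ = env.reset()
    for _ in range(max_steps):
        action = np.random.choice(len(policy[state]), p=policy[state])
        next_state, reward, terminated, truncated, _ = env.step(action)
        episode_hist.append((state, action, reward))
        frame = env.render() if render else None
        yield episode_hist, terminated, frame
        if terminated or truncated:
            break
        state = next_state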
policies/{MCAgent_CliffWalking-v0_gamma:1.0_epsilon:0.4_e1500_s200_first_visit.npy → MCAgent_CliffWalking-v0_gamma:1.0_epsilon:0.4_type:onpolicy_e1500_s200.npy}
RENAMED
File without changes

policies/{MCAgent_FrozenLake-v1_gamma:1.0_epsilon:0.4_size:8_seed:35280_e1500_s200_first_visit.npy → MCAgent_FrozenLake-v1_gamma:1.0_epsilon:0.4_size:8_seed:35280_type:onpolicy_e1500_s200.npy}
RENAMED
File without changes

policies/{MCAgent_FrozenLake-v1_gamma:1.0_epsilon:0.4_size:8_seed:61252_e1500_s200_first_visit.npy → MCAgent_FrozenLake-v1_gamma:1.0_epsilon:0.4_size:8_seed:61252_type:onpolicy_e1500_s200.npy}
RENAMED
File without changes

policies/{MCAgent_FrozenLake-v1_gamma:1.0_epsilon:0.4_size:8_seed:96883_e1500_s200_first_visit.npy → MCAgent_FrozenLake-v1_gamma:1.0_epsilon:0.4_size:8_seed:96883_type:onpolicy_e1500_s200.npy}
RENAMED
File without changes

policies/{MCAgent_Taxi-v3_gamma:1.0_epsilon:0.75_e15000_s200_first_visit.npy → MCAgent_Taxi-v3_gamma:1.0_epsilon:0.75_type:onpolicy_e15000_s200.npy}
RENAMED
File without changes
run.py
CHANGED
@@ -66,11 +66,11 @@ def main():
     )
 
     parser.add_argument(
-        "--…
+        "--type",
         type=str,
-        choices=["…
-        default="…
-        help="The type of update to use. Only supported by Monte-Carlo agent. (default: …
+        choices=["onpolicy", "offpolicy"],
+        default="onpolicy",
+        help="The type of update to use. Only supported by Monte-Carlo agent. (default: onpolicy)",
     )
 
     ### Environment parameters
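
With the renamed flag, a training run might be launched like this; only --type and its two values are confirmed by this diff, the rest of the command line is assumed:

# hypothetical invocations; only --type is shown in this commit
python run.py --type offpolicy   # off-policy MC control
python run.py                    # defaults to --type onpolicy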