Added lunar lander files

Files added:
- agent.py +849 -0
- lunar_lander.py +332 -0
- params.py +12 -0
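Usage note (based on the __main__ block in lunar_lander.py below): run "python lunar_lander.py --build_exe <path-to-simulate-unity-build>"; --build_exe defaults to an empty string and --num_steps is parsed but currently unused. Checkpoints and TensorBoard logs are written to checkpoints/lunar-lander-dueling-dqn-rc/.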
agent.py
ADDED
|
@@ -0,0 +1,849 @@
| 1 |
+
import torch
|
| 2 |
+
import numpy as np
|
| 3 |
+
import random
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
import copy
|
| 6 |
+
import time, datetime
|
| 7 |
+
import matplotlib.pyplot as plt
|
| 8 |
+
from collections import deque
|
| 9 |
+
from torch.utils.tensorboard import SummaryWriter
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class DQNet(nn.Module):
|
| 13 |
+
"""mini cnn structure"""
|
| 14 |
+
|
| 15 |
+
def __init__(self, input_dim, output_dim):
|
| 16 |
+
super().__init__()
|
| 17 |
+
|
| 18 |
+
self.online = nn.Sequential(
|
| 19 |
+
nn.Linear(input_dim, 100),
|
| 20 |
+
nn.ReLU(),
|
| 21 |
+
nn.Linear(100, 120),
|
| 22 |
+
nn.ReLU(),
|
| 23 |
+
nn.Linear(120, output_dim),
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
self.target = copy.deepcopy(self.online)
|
| 28 |
+
|
| 29 |
+
# Q_target parameters are frozen.
|
| 30 |
+
for p in self.target.parameters():
|
| 31 |
+
p.requires_grad = False
|
| 32 |
+
|
| 33 |
+
def forward(self, input, model):
|
| 34 |
+
if model == "online":
|
| 35 |
+
return self.online(input)
|
| 36 |
+
elif model == "target":
|
| 37 |
+
return self.target(input)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class MetricLogger:
|
| 42 |
+
def __init__(self, save_dir):
|
| 43 |
+
self.writer = SummaryWriter(log_dir=save_dir)
|
| 44 |
+
self.save_log = save_dir / "log"
|
| 45 |
+
with open(self.save_log, "w") as f:
|
| 46 |
+
f.write(
|
| 47 |
+
f"{'Episode':>8}{'Step':>8}{'Epsilon':>10}{'MeanReward':>15}"
|
| 48 |
+
f"{'MeanLength':>15}{'MeanLoss':>15}{'MeanQValue':>15}"
|
| 49 |
+
f"{'TimeDelta':>15}{'Time':>20}\n"
|
| 50 |
+
)
|
| 51 |
+
self.ep_rewards_plot = save_dir / "reward_plot.jpg"
|
| 52 |
+
self.ep_lengths_plot = save_dir / "length_plot.jpg"
|
| 53 |
+
self.ep_avg_losses_plot = save_dir / "loss_plot.jpg"
|
| 54 |
+
self.ep_avg_qs_plot = save_dir / "q_plot.jpg"
|
| 55 |
+
|
| 56 |
+
# History metrics
|
| 57 |
+
self.ep_rewards = []
|
| 58 |
+
self.ep_lengths = []
|
| 59 |
+
self.ep_avg_losses = []
|
| 60 |
+
self.ep_avg_qs = []
|
| 61 |
+
|
| 62 |
+
# Moving averages, added for every call to record()
|
| 63 |
+
self.moving_avg_ep_rewards = []
|
| 64 |
+
self.moving_avg_ep_lengths = []
|
| 65 |
+
self.moving_avg_ep_avg_losses = []
|
| 66 |
+
self.moving_avg_ep_avg_qs = []
|
| 67 |
+
|
| 68 |
+
# Current episode metric
|
| 69 |
+
self.init_episode()
|
| 70 |
+
|
| 71 |
+
# Timing
|
| 72 |
+
self.record_time = time.time()
|
| 73 |
+
|
| 74 |
+
def log_step(self, reward, loss, q):
|
| 75 |
+
self.curr_ep_reward += reward
|
| 76 |
+
self.curr_ep_length += 1
|
| 77 |
+
if loss:
|
| 78 |
+
self.curr_ep_loss += loss
|
| 79 |
+
self.curr_ep_q += q
|
| 80 |
+
self.curr_ep_loss_length += 1
|
| 81 |
+
|
| 82 |
+
def log_episode(self, episode_number):
|
| 83 |
+
"Mark end of episode"
|
| 84 |
+
self.ep_rewards.append(self.curr_ep_reward)
|
| 85 |
+
self.ep_lengths.append(self.curr_ep_length)
|
| 86 |
+
if self.curr_ep_loss_length == 0:
|
| 87 |
+
ep_avg_loss = 0
|
| 88 |
+
ep_avg_q = 0
|
| 89 |
+
else:
|
| 90 |
+
ep_avg_loss = np.round(self.curr_ep_loss / self.curr_ep_loss_length, 5)
|
| 91 |
+
ep_avg_q = np.round(self.curr_ep_q / self.curr_ep_loss_length, 5)
|
| 92 |
+
self.ep_avg_losses.append(ep_avg_loss)
|
| 93 |
+
self.ep_avg_qs.append(ep_avg_q)
|
| 94 |
+
self.writer.add_scalar("Avg Loss for episode", ep_avg_loss, episode_number)
|
| 95 |
+
self.writer.add_scalar("Avg Q value for episode", ep_avg_q, episode_number)
|
| 96 |
+
self.writer.flush()
|
| 97 |
+
self.init_episode()
|
| 98 |
+
|
| 99 |
+
def init_episode(self):
|
| 100 |
+
self.curr_ep_reward = 0.0
|
| 101 |
+
self.curr_ep_length = 0
|
| 102 |
+
self.curr_ep_loss = 0.0
|
| 103 |
+
self.curr_ep_q = 0.0
|
| 104 |
+
self.curr_ep_loss_length = 0
|
| 105 |
+
|
| 106 |
+
def record(self, episode, epsilon, step):
|
| 107 |
+
mean_ep_reward = np.round(np.mean(self.ep_rewards[-100:]), 3)
|
| 108 |
+
mean_ep_length = np.round(np.mean(self.ep_lengths[-100:]), 3)
|
| 109 |
+
mean_ep_loss = np.round(np.mean(self.ep_avg_losses[-100:]), 3)
|
| 110 |
+
mean_ep_q = np.round(np.mean(self.ep_avg_qs[-100:]), 3)
|
| 111 |
+
self.moving_avg_ep_rewards.append(mean_ep_reward)
|
| 112 |
+
self.moving_avg_ep_lengths.append(mean_ep_length)
|
| 113 |
+
self.moving_avg_ep_avg_losses.append(mean_ep_loss)
|
| 114 |
+
self.moving_avg_ep_avg_qs.append(mean_ep_q)
|
| 115 |
+
|
| 116 |
+
last_record_time = self.record_time
|
| 117 |
+
self.record_time = time.time()
|
| 118 |
+
time_since_last_record = np.round(self.record_time - last_record_time, 3)
|
| 119 |
+
|
| 120 |
+
print(
|
| 121 |
+
f"Episode {episode} - "
|
| 122 |
+
f"Step {step} - "
|
| 123 |
+
f"Epsilon {epsilon} - "
|
| 124 |
+
f"Mean Reward {mean_ep_reward} - "
|
| 125 |
+
f"Mean Length {mean_ep_length} - "
|
| 126 |
+
f"Mean Loss {mean_ep_loss} - "
|
| 127 |
+
f"Mean Q Value {mean_ep_q} - "
|
| 128 |
+
f"Time Delta {time_since_last_record} - "
|
| 129 |
+
f"Time {datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')}"
|
| 130 |
+
)
|
| 131 |
+
self.writer.add_scalar("Mean reward last 100 episodes", mean_ep_reward, episode)
|
| 132 |
+
self.writer.add_scalar("Mean length last 100 episodes", mean_ep_length, episode)
|
| 133 |
+
self.writer.add_scalar("Mean loss last 100 episodes", mean_ep_loss, episode)
|
| 135 |
+
self.writer.add_scalar("Epsilon value", epsilon, episode)
|
| 136 |
+
self.writer.add_scalar("Mean Q Value last 100 episodes", mean_ep_q, episode)
|
| 137 |
+
self.writer.flush()
|
| 138 |
+
with open(self.save_log, "a") as f:
|
| 139 |
+
f.write(
|
| 140 |
+
f"{episode:8d}{step:8d}{epsilon:10.3f}"
|
| 141 |
+
f"{mean_ep_reward:15.3f}{mean_ep_length:15.3f}{mean_ep_loss:15.3f}{mean_ep_q:15.3f}"
|
| 142 |
+
f"{time_since_last_record:15.3f}"
|
| 143 |
+
f"{datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S'):>20}\n"
|
| 144 |
+
)
|
| 145 |
+
|
| 146 |
+
for metric in ["ep_rewards", "ep_lengths", "ep_avg_losses", "ep_avg_qs"]:
|
| 147 |
+
plt.plot(getattr(self, f"moving_avg_{metric}"))
|
| 148 |
+
plt.savefig(getattr(self, f"{metric}_plot"))
|
| 149 |
+
plt.clf()
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
class DQNAgent:
|
| 153 |
+
def __init__(self,
|
| 154 |
+
state_dim,
|
| 155 |
+
action_dim,
|
| 156 |
+
save_dir,
|
| 157 |
+
checkpoint=None,
|
| 158 |
+
learning_rate=0.00025,
|
| 159 |
+
max_memory_size=100000,
|
| 160 |
+
batch_size=32,
|
| 161 |
+
exploration_rate=1,
|
| 162 |
+
exploration_rate_decay=0.9999999,
|
| 163 |
+
exploration_rate_min=0.1,
|
| 164 |
+
training_frequency=1,
|
| 165 |
+
learning_starts=1000,
|
| 166 |
+
target_network_sync_frequency=500,
|
| 167 |
+
reset_exploration_rate=False,
|
| 168 |
+
save_frequency=100000,
|
| 169 |
+
gamma=0.9,
|
| 170 |
+
load_replay_buffer=True):
|
| 171 |
+
self.state_dim = state_dim
|
| 172 |
+
self.action_dim = action_dim
|
| 173 |
+
self.max_memory_size = max_memory_size
|
| 174 |
+
self.memory = deque(maxlen=max_memory_size)
|
| 175 |
+
self.batch_size = batch_size
|
| 176 |
+
|
| 177 |
+
self.exploration_rate = exploration_rate
|
| 178 |
+
self.exploration_rate_decay = exploration_rate_decay
|
| 179 |
+
self.exploration_rate_min = exploration_rate_min
|
| 180 |
+
self.gamma = gamma
|
| 181 |
+
|
| 182 |
+
self.curr_step = 0
|
| 183 |
+
self.learning_starts = learning_starts # min. experiences before training
|
| 184 |
+
|
| 185 |
+
self.training_frequency = training_frequency # no. of experiences between updates to Q_online
|
| 186 |
+
self.target_network_sync_frequency = target_network_sync_frequency # no. of experiences between Q_target & Q_online sync
|
| 187 |
+
|
| 188 |
+
self.save_every = save_frequency # no. of experiences between saving the network
|
| 189 |
+
self.save_dir = save_dir
|
| 190 |
+
|
| 191 |
+
self.use_cuda = torch.cuda.is_available()
|
| 192 |
+
|
| 193 |
+
self.net = DQNet(self.state_dim, self.action_dim).float()
|
| 194 |
+
if self.use_cuda:
|
| 195 |
+
self.net = self.net.to(device='cuda')
|
| 196 |
+
if checkpoint:
|
| 197 |
+
self.load(checkpoint, reset_exploration_rate, load_replay_buffer)
|
| 198 |
+
|
| 199 |
+
self.optimizer = torch.optim.AdamW(self.net.parameters(), lr=learning_rate, amsgrad=True)
|
| 200 |
+
self.loss_fn = torch.nn.SmoothL1Loss()
|
| 201 |
+
# self.optimizer = torch.optim.Adam(self.net.parameters(), lr=learning_rate)
|
| 202 |
+
# self.loss_fn = torch.nn.MSELoss()
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
def act(self, state):
|
| 206 |
+
"""
|
| 207 |
+
Given a state, choose an epsilon-greedy action and update value of step.
|
| 208 |
+
|
| 209 |
+
Inputs:
|
| 210 |
+
state(LazyFrame): A single observation of the current state, dimension is (state_dim)
|
| 211 |
+
Outputs:
|
| 212 |
+
action_idx (int): An integer representing which action the agent will perform
|
| 213 |
+
"""
|
| 214 |
+
# EXPLORE
|
| 215 |
+
if np.random.rand() < self.exploration_rate:
|
| 216 |
+
action_idx = np.random.randint(self.action_dim)
|
| 217 |
+
|
| 218 |
+
# EXPLOIT
|
| 219 |
+
else:
|
| 220 |
+
state = torch.FloatTensor(state).cuda() if self.use_cuda else torch.FloatTensor(state)
|
| 221 |
+
state = state.unsqueeze(0)
|
| 222 |
+
action_values = self.net(state, model='online')
|
| 223 |
+
action_idx = torch.argmax(action_values, axis=1).item()
|
| 224 |
+
|
| 225 |
+
# decrease exploration_rate
|
| 226 |
+
|
| 227 |
+
self.exploration_rate *= self.exploration_rate_decay
|
| 228 |
+
self.exploration_rate = max(self.exploration_rate_min, self.exploration_rate)
|
| 229 |
+
|
| 230 |
+
# increment step
|
| 231 |
+
self.curr_step += 1
|
| 232 |
+
return action_idx
|
| 233 |
+
|
| 234 |
+
def cache(self, state, next_state, action, reward, done):
|
| 235 |
+
"""
|
| 236 |
+
Store the experience to self.memory (replay buffer)
|
| 237 |
+
|
| 238 |
+
Inputs:
|
| 239 |
+
state (LazyFrame),
|
| 240 |
+
next_state (LazyFrame),
|
| 241 |
+
action (int),
|
| 242 |
+
reward (float),
|
| 243 |
+
done (bool)
|
| 244 |
+
"""
|
| 245 |
+
state = torch.FloatTensor(state).cuda() if self.use_cuda else torch.FloatTensor(state)
|
| 246 |
+
next_state = torch.FloatTensor(next_state).cuda() if self.use_cuda else torch.FloatTensor(next_state)
|
| 247 |
+
action = torch.LongTensor([action]).cuda() if self.use_cuda else torch.LongTensor([action])
|
| 248 |
+
reward = torch.DoubleTensor([reward]).cuda() if self.use_cuda else torch.DoubleTensor([reward])
|
| 249 |
+
done = torch.BoolTensor([done]).cuda() if self.use_cuda else torch.BoolTensor([done])
|
| 250 |
+
|
| 251 |
+
self.memory.append( (state, next_state, action, reward, done,) )
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
def recall(self):
|
| 255 |
+
"""
|
| 256 |
+
Retrieve a batch of experiences from memory
|
| 257 |
+
"""
|
| 258 |
+
batch = random.sample(self.memory, self.batch_size)
|
| 259 |
+
state, next_state, action, reward, done = map(torch.stack, zip(*batch))
|
| 260 |
+
return state, next_state, action.squeeze(), reward.squeeze(), done.squeeze()
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
def td_estimate(self, states, actions):
|
| 264 |
+
actions = actions.reshape(-1, 1)
|
| 265 |
+
predicted_qs = self.net(states, model='online')# Q_online(s,a)
|
| 266 |
+
predicted_qs = predicted_qs.gather(1, actions)
|
| 267 |
+
return predicted_qs
|
| 268 |
+
|
| 269 |
+
|
| 270 |
+
@torch.no_grad()
|
| 271 |
+
def td_target(self, rewards, next_states, dones):
|
| 272 |
+
rewards = rewards.reshape(-1, 1)
|
| 273 |
+
dones = dones.reshape(-1, 1)
|
| 274 |
+
target_qs = self.net(next_states, model='target')
|
| 275 |
+
target_qs = torch.max(target_qs, dim=1).values
|
| 276 |
+
target_qs = target_qs.reshape(-1, 1)
|
| 277 |
+
target_qs[dones] = 0.0
|
| 278 |
+
return (rewards + (self.gamma * target_qs))
|
| 279 |
+
|
| 280 |
+
def update_Q_online(self, td_estimate, td_target) :
|
| 281 |
+
loss = self.loss_fn(td_estimate.float(), td_target.float())
|
| 282 |
+
self.optimizer.zero_grad()
|
| 283 |
+
loss.backward()
|
| 284 |
+
self.optimizer.step()
|
| 285 |
+
return loss.item()
|
| 286 |
+
|
| 287 |
+
|
| 288 |
+
def sync_Q_target(self):
|
| 289 |
+
self.net.target.load_state_dict(self.net.online.state_dict())
|
| 290 |
+
|
| 291 |
+
|
| 292 |
+
def learn(self):
|
| 293 |
+
if self.curr_step % self.target_network_sync_frequency == 0:
|
| 294 |
+
self.sync_Q_target()
|
| 295 |
+
|
| 296 |
+
if self.curr_step % self.save_every == 0:
|
| 297 |
+
self.save()
|
| 298 |
+
|
| 299 |
+
if self.curr_step < self.learning_starts:
|
| 300 |
+
return None, None
|
| 301 |
+
|
| 302 |
+
if self.curr_step % self.training_frequency != 0:
|
| 303 |
+
return None, None
|
| 304 |
+
|
| 305 |
+
# Sample from memory
|
| 306 |
+
state, next_state, action, reward, done = self.recall()
|
| 307 |
+
|
| 308 |
+
# Get TD Estimate
|
| 309 |
+
td_est = self.td_estimate(state, action)
|
| 310 |
+
|
| 311 |
+
# Get TD Target
|
| 312 |
+
td_tgt = self.td_target(reward, next_state, done)
|
| 313 |
+
|
| 314 |
+
# Backpropagate loss through Q_online
|
| 315 |
+
|
| 316 |
+
loss = self.update_Q_online(td_est, td_tgt)
|
| 317 |
+
|
| 318 |
+
return (td_est.mean().item(), loss)
|
| 319 |
+
|
| 320 |
+
|
| 321 |
+
def save(self):
|
| 322 |
+
save_path = self.save_dir / f"airstriker_net_{int(self.curr_step // self.save_every)}.chkpt"
|
| 323 |
+
torch.save(
|
| 324 |
+
dict(
|
| 325 |
+
model=self.net.state_dict(),
|
| 326 |
+
exploration_rate=self.exploration_rate,
|
| 327 |
+
replay_memory=self.memory
|
| 328 |
+
),
|
| 329 |
+
save_path
|
| 330 |
+
)
|
| 331 |
+
|
| 332 |
+
print(f"Airstriker model saved to {save_path} at step {self.curr_step}")
|
| 333 |
+
|
| 334 |
+
|
| 335 |
+
def load(self, load_path, reset_exploration_rate, load_replay_buffer):
|
| 336 |
+
if not load_path.exists():
|
| 337 |
+
raise ValueError(f"{load_path} does not exist")
|
| 338 |
+
|
| 339 |
+
ckp = torch.load(load_path, map_location=('cuda' if self.use_cuda else 'cpu'))
|
| 340 |
+
exploration_rate = ckp.get('exploration_rate')
|
| 341 |
+
state_dict = ckp.get('model')
|
| 342 |
+
|
| 343 |
+
|
| 344 |
+
print(f"Loading model at {load_path} with exploration rate {exploration_rate}")
|
| 345 |
+
self.net.load_state_dict(state_dict)
|
| 346 |
+
|
| 347 |
+
if load_replay_buffer:
|
| 348 |
+
replay_memory = ckp.get('replay_memory')
|
| 349 |
+
print(f"Loading replay memory. Len {len(replay_memory)}" if replay_memory else "Saved replay memory not found. Not restoring replay memory.")
|
| 350 |
+
self.memory = replay_memory if replay_memory else self.memory
|
| 351 |
+
|
| 352 |
+
if reset_exploration_rate:
|
| 353 |
+
print(f"Reset exploration rate option specified. Not restoring saved exploration rate {exploration_rate}. The current exploration rate is {self.exploration_rate}")
|
| 354 |
+
else:
|
| 355 |
+
print(f"Setting exploration rate to {exploration_rate} not loaded.")
|
| 356 |
+
self.exploration_rate = exploration_rate
|
| 357 |
+
|
| 358 |
+
|
| 359 |
+
class DDQNAgent(DQNAgent):
|
| 360 |
+
@torch.no_grad()
|
| 361 |
+
def td_target(self, rewards, next_states, dones):
|
| 362 |
+
rewards = rewards.reshape(-1, 1)
|
| 363 |
+
dones = dones.reshape(-1, 1)
|
| 364 |
+
q_vals = self.net(next_states, model='online')
|
| 365 |
+
target_actions = torch.argmax(q_vals, axis=1)
|
| 366 |
+
target_actions = target_actions.reshape(-1, 1)
|
| 367 |
+
|
| 368 |
+
target_qs = self.net(next_states, model='target')
|
| 369 |
+
target_qs = target_qs.gather(1, target_actions)
|
| 370 |
+
target_qs = target_qs.reshape(-1, 1)
|
| 371 |
+
target_qs[dones] = 0.0
|
| 372 |
+
return (rewards + (self.gamma * target_qs))
|
| 373 |
+
|
| 374 |
+
|
| 375 |
+
class DuelingDQNet(nn.Module):
|
| 376 |
+
def __init__(self, input_dim, output_dim):
|
| 377 |
+
super().__init__()
|
| 378 |
+
self.feature_layer = nn.Sequential(
|
| 379 |
+
nn.Linear(input_dim, 150),
|
| 380 |
+
nn.ReLU(),
|
| 381 |
+
nn.Linear(150, 120),
|
| 382 |
+
nn.ReLU()
|
| 383 |
+
)
|
| 384 |
+
|
| 385 |
+
self.value_layer = nn.Sequential(
|
| 386 |
+
nn.Linear(120, 120),
|
| 387 |
+
nn.ReLU(),
|
| 388 |
+
nn.Linear(120, 1)
|
| 389 |
+
)
|
| 390 |
+
|
| 391 |
+
self.advantage_layer = nn.Sequential(
|
| 392 |
+
nn.Linear(120, 120),
|
| 393 |
+
nn.ReLU(),
|
| 394 |
+
nn.Linear(120, output_dim)
|
| 395 |
+
)
|
| 396 |
+
|
| 397 |
+
def forward(self, state):
|
| 398 |
+
feature_output = self.feature_layer(state)
|
| 399 |
+
# feature_output = feature_output.view(feature_output.size(0), -1)
|
| 400 |
+
value = self.value_layer(feature_output)
|
| 401 |
+
advantage = self.advantage_layer(feature_output)
|
| 402 |
+
q_value = value + (advantage - advantage.mean())
|
| 403 |
+
|
| 404 |
+
return q_value
|
| 405 |
+
|
| 406 |
+
|
| 407 |
+
class DuelingDQNAgent:
|
| 408 |
+
def __init__(self,
|
| 409 |
+
state_dim,
|
| 410 |
+
action_dim,
|
| 411 |
+
save_dir,
|
| 412 |
+
checkpoint=None,
|
| 413 |
+
learning_rate=0.00025,
|
| 414 |
+
max_memory_size=100000,
|
| 415 |
+
batch_size=32,
|
| 416 |
+
exploration_rate=1,
|
| 417 |
+
exploration_rate_decay=0.9999999,
|
| 418 |
+
exploration_rate_min=0.1,
|
| 419 |
+
training_frequency=1,
|
| 420 |
+
learning_starts=1000,
|
| 421 |
+
target_network_sync_frequency=500,
|
| 422 |
+
reset_exploration_rate=False,
|
| 423 |
+
save_frequency=100000,
|
| 424 |
+
gamma=0.9,
|
| 425 |
+
load_replay_buffer=True):
|
| 426 |
+
self.state_dim = state_dim
|
| 427 |
+
self.action_dim = action_dim
|
| 428 |
+
self.max_memory_size = max_memory_size
|
| 429 |
+
self.memory = deque(maxlen=max_memory_size)
|
| 430 |
+
self.batch_size = batch_size
|
| 431 |
+
|
| 432 |
+
self.exploration_rate = exploration_rate
|
| 433 |
+
self.exploration_rate_decay = exploration_rate_decay
|
| 434 |
+
self.exploration_rate_min = exploration_rate_min
|
| 435 |
+
self.gamma = gamma
|
| 436 |
+
|
| 437 |
+
self.curr_step = 0
|
| 438 |
+
self.learning_starts = learning_starts # min. experiences before training
|
| 439 |
+
|
| 440 |
+
self.training_frequency = training_frequency # no. of experiences between updates to Q_online
|
| 441 |
+
self.target_network_sync_frequency = target_network_sync_frequency # no. of experiences between Q_target & Q_online sync
|
| 442 |
+
|
| 443 |
+
self.save_every = save_frequency # no. of experiences between saving the network
|
| 444 |
+
self.save_dir = save_dir
|
| 445 |
+
|
| 446 |
+
self.use_cuda = torch.cuda.is_available()
|
| 447 |
+
|
| 448 |
+
|
| 449 |
+
self.online_net = DuelingDQNet(self.state_dim, self.action_dim).float()
|
| 450 |
+
self.target_net = copy.deepcopy(self.online_net)
|
| 451 |
+
# Q_target parameters are frozen.
|
| 452 |
+
for p in self.target_net.parameters():
|
| 453 |
+
p.requires_grad = False
|
| 454 |
+
|
| 455 |
+
if self.use_cuda:
|
| 456 |
+
self.online_net = self.online_net.to(device='cuda')
|
| 457 |
+
self.target_net = self.target_net.to(device='cuda')
|
| 458 |
+
if checkpoint:
|
| 459 |
+
self.load(checkpoint, reset_exploration_rate, load_replay_buffer)
|
| 460 |
+
|
| 461 |
+
self.optimizer = torch.optim.AdamW(self.online_net.parameters(), lr=learning_rate, amsgrad=True)
|
| 462 |
+
self.loss_fn = torch.nn.SmoothL1Loss()
|
| 463 |
+
# self.optimizer = torch.optim.Adam(self.online_net.parameters(), lr=learning_rate)
|
| 464 |
+
# self.loss_fn = torch.nn.MSELoss()
|
| 465 |
+
|
| 466 |
+
|
| 467 |
+
def act(self, state):
|
| 468 |
+
"""
|
| 469 |
+
Given a state, choose an epsilon-greedy action and update value of step.
|
| 470 |
+
|
| 471 |
+
Inputs:
|
| 472 |
+
state(LazyFrame): A single observation of the current state, dimension is (state_dim)
|
| 473 |
+
Outputs:
|
| 474 |
+
action_idx (int): An integer representing which action the agent will perform
|
| 475 |
+
"""
|
| 476 |
+
# EXPLORE
|
| 477 |
+
if np.random.rand() < self.exploration_rate:
|
| 478 |
+
action_idx = np.random.randint(self.action_dim)
|
| 479 |
+
|
| 480 |
+
# EXPLOIT
|
| 481 |
+
else:
|
| 482 |
+
state = torch.FloatTensor(state).cuda() if self.use_cuda else torch.FloatTensor(state)
|
| 483 |
+
state = state.unsqueeze(0)
|
| 484 |
+
action_values = self.online_net(state)
|
| 485 |
+
action_idx = torch.argmax(action_values, axis=1).item()
|
| 486 |
+
|
| 487 |
+
# decrease exploration_rate
|
| 488 |
+
self.exploration_rate *= self.exploration_rate_decay
|
| 489 |
+
self.exploration_rate = max(self.exploration_rate_min, self.exploration_rate)
|
| 490 |
+
|
| 491 |
+
# increment step
|
| 492 |
+
self.curr_step += 1
|
| 493 |
+
return action_idx
|
| 494 |
+
|
| 495 |
+
def cache(self, state, next_state, action, reward, done):
|
| 496 |
+
"""
|
| 497 |
+
Store the experience to self.memory (replay buffer)
|
| 498 |
+
|
| 499 |
+
Inputs:
|
| 500 |
+
state (LazyFrame),
|
| 501 |
+
next_state (LazyFrame),
|
| 502 |
+
action (int),
|
| 503 |
+
reward (float),
|
| 504 |
+
done (bool)
|
| 505 |
+
"""
|
| 508 |
+
state = torch.FloatTensor(state).cuda() if self.use_cuda else torch.FloatTensor(state)
|
| 509 |
+
next_state = torch.FloatTensor(next_state).cuda() if self.use_cuda else torch.FloatTensor(next_state)
|
| 510 |
+
action = torch.LongTensor([action]).cuda() if self.use_cuda else torch.LongTensor([action])
|
| 511 |
+
reward = torch.DoubleTensor([reward]).cuda() if self.use_cuda else torch.DoubleTensor([reward])
|
| 512 |
+
done = torch.BoolTensor([done]).cuda() if self.use_cuda else torch.BoolTensor([done])
|
| 513 |
+
|
| 514 |
+
self.memory.append( (state, next_state, action, reward, done,) )
|
| 515 |
+
|
| 516 |
+
|
| 517 |
+
def recall(self):
|
| 518 |
+
"""
|
| 519 |
+
Retrieve a batch of experiences from memory
|
| 520 |
+
"""
|
| 521 |
+
batch = random.sample(self.memory, self.batch_size)
|
| 522 |
+
state, next_state, action, reward, done = map(torch.stack, zip(*batch))
|
| 523 |
+
return state, next_state, action.squeeze(), reward.squeeze(), done.squeeze()
|
| 524 |
+
|
| 525 |
+
|
| 526 |
+
def td_estimate(self, states, actions):
|
| 527 |
+
actions = actions.reshape(-1, 1)
|
| 528 |
+
predicted_qs = self.online_net(states)# Q_online(s,a)
|
| 529 |
+
predicted_qs = predicted_qs.gather(1, actions)
|
| 530 |
+
return predicted_qs
|
| 531 |
+
|
| 532 |
+
|
| 533 |
+
@torch.no_grad()
|
| 534 |
+
def td_target(self, rewards, next_states, dones):
|
| 535 |
+
rewards = rewards.reshape(-1, 1)
|
| 536 |
+
dones = dones.reshape(-1, 1)
|
| 537 |
+
target_qs = self.target_net.forward(next_states)
|
| 538 |
+
target_qs = torch.max(target_qs, dim=1).values
|
| 539 |
+
target_qs = target_qs.reshape(-1, 1)
|
| 540 |
+
target_qs[dones] = 0.0
|
| 541 |
+
return (rewards + (self.gamma * target_qs))
|
| 542 |
+
|
| 543 |
+
def update_Q_online(self, td_estimate, td_target) :
|
| 544 |
+
loss = self.loss_fn(td_estimate.float(), td_target.float())
|
| 545 |
+
self.optimizer.zero_grad()
|
| 546 |
+
loss.backward()
|
| 547 |
+
self.optimizer.step()
|
| 548 |
+
return loss.item()
|
| 549 |
+
|
| 550 |
+
|
| 551 |
+
def sync_Q_target(self):
|
| 552 |
+
self.target_net.load_state_dict(self.online_net.state_dict())
|
| 553 |
+
|
| 554 |
+
|
| 555 |
+
def learn(self):
|
| 556 |
+
if self.curr_step % self.target_network_sync_frequency == 0:
|
| 557 |
+
self.sync_Q_target()
|
| 558 |
+
|
| 559 |
+
if self.curr_step % self.save_every == 0:
|
| 560 |
+
self.save()
|
| 561 |
+
|
| 562 |
+
if self.curr_step < self.learning_starts:
|
| 563 |
+
return None, None
|
| 564 |
+
|
| 565 |
+
if self.curr_step % self.training_frequency != 0:
|
| 566 |
+
return None, None
|
| 567 |
+
|
| 568 |
+
# Sample from memory
|
| 569 |
+
state, next_state, action, reward, done = self.recall()
|
| 570 |
+
|
| 571 |
+
# Get TD Estimate
|
| 572 |
+
td_est = self.td_estimate(state, action)
|
| 573 |
+
|
| 574 |
+
# Get TD Target
|
| 575 |
+
td_tgt = self.td_target(reward, next_state, done)
|
| 576 |
+
|
| 577 |
+
# Backpropagate loss through Q_online
|
| 578 |
+
loss = self.update_Q_online(td_est, td_tgt)
|
| 579 |
+
|
| 580 |
+
return (td_est.mean().item(), loss)
|
| 581 |
+
|
| 582 |
+
|
| 583 |
+
def save(self):
|
| 584 |
+
save_path = self.save_dir / f"airstriker_net_{int(self.curr_step // self.save_every)}.chkpt"
|
| 585 |
+
torch.save(
|
| 586 |
+
dict(
|
| 587 |
+
model=self.online_net.state_dict(),
|
| 588 |
+
exploration_rate=self.exploration_rate,
|
| 589 |
+
replay_memory=self.memory
|
| 590 |
+
),
|
| 591 |
+
save_path
|
| 592 |
+
)
|
| 593 |
+
|
| 594 |
+
print(f"Airstriker model saved to {save_path} at step {self.curr_step}")
|
| 595 |
+
|
| 596 |
+
|
| 597 |
+
def load(self, load_path, reset_exploration_rate, load_replay_buffer):
|
| 598 |
+
if not load_path.exists():
|
| 599 |
+
raise ValueError(f"{load_path} does not exist")
|
| 600 |
+
|
| 601 |
+
ckp = torch.load(load_path, map_location=('cuda' if self.use_cuda else 'cpu'))
|
| 602 |
+
exploration_rate = ckp.get('exploration_rate')
|
| 603 |
+
state_dict = ckp.get('model')
|
| 604 |
+
|
| 605 |
+
|
| 606 |
+
print(f"Loading model at {load_path} with exploration rate {exploration_rate}")
|
| 607 |
+
self.online_net.load_state_dict(state_dict)
|
| 608 |
+
self.target_net = copy.deepcopy(self.online_net)
|
| 609 |
+
self.sync_Q_target()
|
| 610 |
+
|
| 611 |
+
if load_replay_buffer:
|
| 612 |
+
replay_memory = ckp.get('replay_memory')
|
| 613 |
+
print(f"Loading replay memory. Len {len(replay_memory)}" if replay_memory else "Saved replay memory not found. Not restoring replay memory.")
|
| 614 |
+
self.memory = replay_memory if replay_memory else self.memory
|
| 615 |
+
|
| 616 |
+
if reset_exploration_rate:
|
| 617 |
+
print(f"Reset exploration rate option specified. Not restoring saved exploration rate {exploration_rate}. The current exploration rate is {self.exploration_rate}")
|
| 618 |
+
else:
|
| 619 |
+
print(f"Setting exploration rate to {exploration_rate} not loaded.")
|
| 620 |
+
self.exploration_rate = exploration_rate
|
| 621 |
+
|
| 622 |
+
|
| 623 |
+
|
| 624 |
+
|
| 625 |
+
class DuelingDDQNAgent(DuelingDQNAgent):
|
| 626 |
+
@torch.no_grad()
|
| 627 |
+
def td_target(self, rewards, next_states, dones):
|
| 628 |
+
rewards = rewards.reshape(-1, 1)
|
| 629 |
+
dones = dones.reshape(-1, 1)
|
| 630 |
+
q_vals = self.online_net.forward(next_states)
|
| 631 |
+
target_actions = torch.argmax(q_vals, axis=1)
|
| 632 |
+
target_actions = target_actions.reshape(-1, 1)
|
| 633 |
+
|
| 634 |
+
target_qs = self.target_net.forward(next_states)
|
| 635 |
+
target_qs = target_qs.gather(1, target_actions)
|
| 636 |
+
target_qs = target_qs.reshape(-1, 1)
|
| 637 |
+
target_qs[dones] = 0.0
|
| 638 |
+
return (rewards + (self.gamma * target_qs))
|
| 639 |
+
|
| 640 |
+
|
| 641 |
+
|
| 642 |
+
class DQNAgentWithStepDecay:
|
| 643 |
+
def __init__(self,
|
| 644 |
+
state_dim,
|
| 645 |
+
action_dim,
|
| 646 |
+
save_dir,
|
| 647 |
+
checkpoint=None,
|
| 648 |
+
learning_rate=0.00025,
|
| 649 |
+
max_memory_size=100000,
|
| 650 |
+
batch_size=32,
|
| 651 |
+
exploration_rate=1,
|
| 652 |
+
exploration_rate_decay=0.9999999,
|
| 653 |
+
exploration_rate_min=0.1,
|
| 654 |
+
training_frequency=1,
|
| 655 |
+
learning_starts=1000,
|
| 656 |
+
target_network_sync_frequency=500,
|
| 657 |
+
reset_exploration_rate=False,
|
| 658 |
+
save_frequency=100000,
|
| 659 |
+
gamma=0.9,
|
| 660 |
+
load_replay_buffer=True):
|
| 661 |
+
self.state_dim = state_dim
|
| 662 |
+
self.action_dim = action_dim
|
| 663 |
+
self.max_memory_size = max_memory_size
|
| 664 |
+
self.memory = deque(maxlen=max_memory_size)
|
| 665 |
+
self.batch_size = batch_size
|
| 666 |
+
|
| 667 |
+
self.exploration_rate = exploration_rate
|
| 668 |
+
self.exploration_rate_decay = exploration_rate_decay
|
| 669 |
+
self.exploration_rate_min = exploration_rate_min
|
| 670 |
+
self.gamma = gamma
|
| 671 |
+
|
| 672 |
+
self.curr_step = 0
|
| 673 |
+
self.learning_starts = learning_starts # min. experiences before training
|
| 674 |
+
|
| 675 |
+
self.training_frequency = training_frequency # no. of experiences between updates to Q_online
|
| 676 |
+
self.target_network_sync_frequency = target_network_sync_frequency # no. of experiences between Q_target & Q_online sync
|
| 677 |
+
|
| 678 |
+
self.save_every = save_frequency # no. of experiences between saving the network
|
| 679 |
+
self.save_dir = save_dir
|
| 680 |
+
|
| 681 |
+
self.use_cuda = torch.cuda.is_available()
|
| 682 |
+
|
| 683 |
+
self.net = DQNet(self.state_dim, self.action_dim).float()
|
| 684 |
+
if self.use_cuda:
|
| 685 |
+
self.net = self.net.to(device='cuda')
|
| 686 |
+
if checkpoint:
|
| 687 |
+
self.load(checkpoint, reset_exploration_rate, load_replay_buffer)
|
| 688 |
+
|
| 689 |
+
self.optimizer = torch.optim.AdamW(self.net.parameters(), lr=learning_rate, amsgrad=True)
|
| 690 |
+
self.loss_fn = torch.nn.SmoothL1Loss()
|
| 691 |
+
# self.optimizer = torch.optim.Adam(self.net.parameters(), lr=learning_rate)
|
| 692 |
+
# self.loss_fn = torch.nn.MSELoss()
|
| 693 |
+
|
| 694 |
+
|
| 695 |
+
def act(self, state):
|
| 696 |
+
"""
|
| 697 |
+
Given a state, choose an epsilon-greedy action and update value of step.
|
| 698 |
+
|
| 699 |
+
Inputs:
|
| 700 |
+
state(LazyFrame): A single observation of the current state, dimension is (state_dim)
|
| 701 |
+
Outputs:
|
| 702 |
+
action_idx (int): An integer representing which action the agent will perform
|
| 703 |
+
"""
|
| 704 |
+
# EXPLORE
|
| 705 |
+
if np.random.rand() < self.exploration_rate:
|
| 706 |
+
action_idx = np.random.randint(self.action_dim)
|
| 707 |
+
|
| 708 |
+
# EXPLOIT
|
| 709 |
+
else:
|
| 710 |
+
state = torch.FloatTensor(state).cuda() if self.use_cuda else torch.FloatTensor(state)
|
| 711 |
+
state = state.unsqueeze(0)
|
| 712 |
+
action_values = self.net(state, model='online')
|
| 713 |
+
action_idx = torch.argmax(action_values, axis=1).item()
|
| 714 |
+
|
| 715 |
+
# decrease exploration_rate
|
| 716 |
+
|
| 717 |
+
self.exploration_rate *= self.exploration_rate_decay
|
| 718 |
+
self.exploration_rate = max(self.exploration_rate_min, self.exploration_rate)
|
| 719 |
+
|
| 720 |
+
# increment step
|
| 721 |
+
self.curr_step += 1
|
| 722 |
+
return action_idx
|
| 723 |
+
|
| 724 |
+
def cache(self, state, next_state, action, reward, done):
|
| 725 |
+
"""
|
| 726 |
+
Store the experience to self.memory (replay buffer)
|
| 727 |
+
|
| 728 |
+
Inputs:
|
| 729 |
+
state (LazyFrame),
|
| 730 |
+
next_state (LazyFrame),
|
| 731 |
+
action (int),
|
| 732 |
+
reward (float),
|
| 733 |
+
done (bool)
|
| 734 |
+
"""
|
| 735 |
+
state = torch.FloatTensor(state).cuda() if self.use_cuda else torch.FloatTensor(state)
|
| 736 |
+
next_state = torch.FloatTensor(next_state).cuda() if self.use_cuda else torch.FloatTensor(next_state)
|
| 737 |
+
action = torch.LongTensor([action]).cuda() if self.use_cuda else torch.LongTensor([action])
|
| 738 |
+
reward = torch.DoubleTensor([reward]).cuda() if self.use_cuda else torch.DoubleTensor([reward])
|
| 739 |
+
done = torch.BoolTensor([done]).cuda() if self.use_cuda else torch.BoolTensor([done])
|
| 740 |
+
|
| 741 |
+
self.memory.append( (state, next_state, action, reward, done) )
|
| 742 |
+
|
| 743 |
+
|
| 744 |
+
def recall(self):
|
| 745 |
+
"""
|
| 746 |
+
Retrieve a batch of experiences from memory
|
| 747 |
+
"""
|
| 748 |
+
batch = random.sample(self.memory, self.batch_size)
|
| 749 |
+
state, next_state, action, reward, done = map(torch.stack, zip(*batch))
|
| 750 |
+
return state, next_state, action.squeeze(), reward.squeeze(), done.squeeze()
|
| 751 |
+
|
| 752 |
+
|
| 753 |
+
def td_estimate(self, states, actions):
|
| 754 |
+
actions = actions.reshape(-1, 1)
|
| 755 |
+
predicted_qs = self.net(states, model='online')# Q_online(s,a)
|
| 756 |
+
predicted_qs = predicted_qs.gather(1, actions)
|
| 757 |
+
return predicted_qs
|
| 758 |
+
|
| 759 |
+
|
| 760 |
+
@torch.no_grad()
|
| 761 |
+
def td_target(self, rewards, next_states, dones):
|
| 762 |
+
rewards = rewards.reshape(-1, 1)
|
| 763 |
+
dones = dones.reshape(-1, 1)
|
| 764 |
+
target_qs = self.net(next_states, model='target')
|
| 765 |
+
target_qs = torch.max(target_qs, dim=1).values
|
| 766 |
+
target_qs = target_qs.reshape(-1, 1)
|
| 767 |
+
target_qs[dones] = 0.0
|
| 768 |
+
val = self.gamma * target_qs
|
| 769 |
+
return (rewards + val)
|
| 770 |
+
|
| 771 |
+
def update_Q_online(self, td_estimate, td_target) :
|
| 772 |
+
loss = self.loss_fn(td_estimate.float(), td_target.float())
|
| 773 |
+
self.optimizer.zero_grad()
|
| 774 |
+
loss.backward()
|
| 775 |
+
self.optimizer.step()
|
| 776 |
+
return loss.item()
|
| 777 |
+
|
| 778 |
+
|
| 779 |
+
def sync_Q_target(self):
|
| 780 |
+
self.net.target.load_state_dict(self.net.online.state_dict())
|
| 781 |
+
|
| 782 |
+
|
| 783 |
+
def learn(self):
|
| 784 |
+
if self.curr_step % self.target_network_sync_frequency == 0:
|
| 785 |
+
self.sync_Q_target()
|
| 786 |
+
|
| 787 |
+
if self.curr_step % self.save_every == 0:
|
| 788 |
+
self.save()
|
| 789 |
+
|
| 790 |
+
if self.curr_step < self.learning_starts:
|
| 791 |
+
return None, None
|
| 792 |
+
|
| 793 |
+
if self.curr_step % self.training_frequency != 0:
|
| 794 |
+
return None, None
|
| 795 |
+
|
| 796 |
+
# Sample from memory
|
| 797 |
+
state, next_state, action, reward, done = self.recall()
|
| 798 |
+
|
| 799 |
+
# Get TD Estimate
|
| 800 |
+
td_est = self.td_estimate(state, action)
|
| 801 |
+
|
| 802 |
+
# Get TD Target
|
| 803 |
+
td_tgt = self.td_target(reward, next_state, done)
|
| 804 |
+
|
| 805 |
+
# Backpropagate loss through Q_online
|
| 806 |
+
|
| 807 |
+
loss = self.update_Q_online(td_est, td_tgt)
|
| 808 |
+
|
| 809 |
+
return (td_est.mean().item(), loss)
|
| 810 |
+
|
| 811 |
+
|
| 812 |
+
def save(self):
|
| 813 |
+
save_path = self.save_dir / f"airstriker_net_{int(self.curr_step // self.save_every)}.chkpt"
|
| 814 |
+
torch.save(
|
| 815 |
+
dict(
|
| 816 |
+
model=self.net.state_dict(),
|
| 817 |
+
exploration_rate=self.exploration_rate,
|
| 818 |
+
replay_memory=self.memory
|
| 819 |
+
),
|
| 820 |
+
save_path
|
| 821 |
+
)
|
| 822 |
+
|
| 823 |
+
print(f"Airstriker model saved to {save_path} at step {self.curr_step}")
|
| 824 |
+
|
| 825 |
+
|
| 826 |
+
def load(self, load_path, reset_exploration_rate, load_replay_buffer):
|
| 827 |
+
if not load_path.exists():
|
| 828 |
+
raise ValueError(f"{load_path} does not exist")
|
| 829 |
+
|
| 830 |
+
ckp = torch.load(load_path, map_location=('cuda' if self.use_cuda else 'cpu'))
|
| 831 |
+
exploration_rate = ckp.get('exploration_rate')
|
| 832 |
+
state_dict = ckp.get('model')
|
| 833 |
+
|
| 834 |
+
|
| 835 |
+
print(f"Loading model at {load_path} with exploration rate {exploration_rate}")
|
| 836 |
+
self.net.load_state_dict(state_dict)
|
| 837 |
+
|
| 838 |
+
if load_replay_buffer:
|
| 839 |
+
replay_memory = ckp.get('replay_memory')
|
| 840 |
+
print(f"Loading replay memory. Len {len(replay_memory)}" if replay_memory else "Saved replay memory not found. Not restoring replay memory.")
|
| 841 |
+
self.memory = replay_memory if replay_memory else self.memory
|
| 842 |
+
|
| 843 |
+
if reset_exploration_rate:
|
| 844 |
+
print(f"Reset exploration rate option specified. Not restoring saved exploration rate {exploration_rate}. The current exploration rate is {self.exploration_rate}")
|
| 845 |
+
else:
|
| 846 |
+
print(f"Setting exploration rate to {exploration_rate} not loaded.")
|
| 847 |
+
self.exploration_rate = exploration_rate
|
| 848 |
+
|
| 849 |
+
|
lunar_lander.py
ADDED
|
@@ -0,0 +1,332 @@
| 1 |
+
# Copyright 2022 The HuggingFace Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
# File inspired by source: https://github.com/openai/gym/blob/master/gym/envs/box2d/lunar_lander.py
|
| 16 |
+
|
| 17 |
+
import argparse
|
| 18 |
+
import time
|
| 19 |
+
import os
|
| 20 |
+
import numpy as np
|
| 21 |
+
|
| 22 |
+
import simulate as sm
|
| 24 |
+
from pathlib import Path
|
| 25 |
+
from agent import DuelingDQNAgent, MetricLogger
|
| 26 |
+
from params import hyperparams
|
| 27 |
+
|
| 28 |
+
# This example reimplements the famous lunar lander reinforcement learning environment.
|
| 29 |
+
|
| 30 |
+
# CONSTANTS From source
|
| 31 |
+
# TODO implement scaling
|
| 32 |
+
SCALE = 30.0 # affects how fast-paced the game is, forces should be adjusted as well
|
| 33 |
+
|
| 34 |
+
# TODO integrate random initial forces
|
| 35 |
+
INITIAL_RANDOM = 1000.0 # Set 1500 to make game harder
|
| 36 |
+
|
| 37 |
+
# Lander construction
|
| 38 |
+
LANDER_POLY = np.array([(-17, -10, 0), (-17, 0, 0), (-14, 17, 0), (14, 17, 0), (17, 0, 0), (17, -10, 0)])[::-1] / SCALE
|
| 39 |
+
LEG_AWAY = 20
|
| 40 |
+
LEG_DOWN = -7
|
| 41 |
+
LEG_ANGLE = 0.25 # radians
|
| 42 |
+
LEG_W, LEG_H = 2, 8
|
| 43 |
+
|
| 44 |
+
LEG_RIGHT_POLY = (
|
| 45 |
+
np.array(
|
| 46 |
+
[
|
| 47 |
+
(LEG_AWAY, LEG_DOWN, 0),
|
| 48 |
+
(LEG_AWAY + LEG_H * np.sin(LEG_ANGLE), LEG_DOWN - LEG_H * np.cos(LEG_ANGLE), 0),
|
| 49 |
+
(
|
| 50 |
+
LEG_AWAY + LEG_H * np.sin(LEG_ANGLE) + LEG_W * np.sin(np.pi / 2 - LEG_ANGLE),
|
| 51 |
+
LEG_DOWN - LEG_H * np.cos(LEG_ANGLE) + LEG_W * np.cos(np.pi / 2 - LEG_ANGLE),
|
| 52 |
+
0,
|
| 53 |
+
),
|
| 54 |
+
(LEG_AWAY + LEG_W * np.sin(np.pi / 2 - LEG_ANGLE), LEG_DOWN + LEG_W * np.cos(np.pi / 2 - LEG_ANGLE), 0),
|
| 55 |
+
]
|
| 56 |
+
)
|
| 57 |
+
/ SCALE
|
| 58 |
+
)
|
| 59 |
+
|
| 60 |
+
LEG_LEFT_POLY = [[-x, y, z] for x, y, z in LEG_RIGHT_POLY][::-1]
|
| 61 |
+
LANDER_COLOR = [128 / 255, 102 / 255, 230 / 255]
|
| 62 |
+
|
| 63 |
+
# terrain construction
|
| 64 |
+
VIEWPORT_W = 600 # TODO integrate camera with these exact dimensions
|
| 65 |
+
VIEWPORT_H = 400
|
| 66 |
+
|
| 67 |
+
W = VIEWPORT_W / SCALE
|
| 68 |
+
H = VIEWPORT_H / SCALE
|
| 69 |
+
|
| 70 |
+
CHUNKS = 11
|
| 71 |
+
HEIGHTS = np.random.uniform(0, H / 2, size=(CHUNKS + 1,))
|
| 72 |
+
CHUNK_X = [W / (CHUNKS - 1) * i for i in range(CHUNKS)]
|
| 73 |
+
HELIPAD_x1 = CHUNK_X[CHUNKS // 2 - 1]
|
| 74 |
+
HELIPAD_x2 = CHUNK_X[CHUNKS // 2 + 1]
|
| 75 |
+
HELIPAD_y = H / 4
|
| 76 |
+
HEIGHTS[CHUNKS // 2 - 2] = HELIPAD_y
|
| 77 |
+
HEIGHTS[CHUNKS // 2 - 1] = HELIPAD_y
|
| 78 |
+
HEIGHTS[CHUNKS // 2 + 0] = HELIPAD_y
|
| 79 |
+
HEIGHTS[CHUNKS // 2 + 1] = HELIPAD_y
|
| 80 |
+
HEIGHTS[CHUNKS // 2 + 2] = HELIPAD_y
|
| 81 |
+
SMOOTH_Y = [0.33 * (HEIGHTS[i - 1] + HEIGHTS[i + 0] + HEIGHTS[i + 1]) for i in range(CHUNKS)]
|
| 82 |
+
|
| 83 |
+
# advanced features
|
| 84 |
+
MAIN_ENGINE_POWER = 13.0 # TODO integrate specific forces
|
| 85 |
+
SIDE_ENGINE_POWER = 0.6 # TODO integrate specific forces
|
| 86 |
+
LEG_SPRING_TORQUE = 40 # TODO integrate specific forces
|
| 87 |
+
SIDE_ENGINE_HEIGHT = 14.0 # TODO integrate specific forces
|
| 88 |
+
SIDE_ENGINE_AWAY = 12.0 # TODO integrate specific forces
|
| 89 |
+
|
| 90 |
+
LAND_POLY = (
|
| 91 |
+
[[CHUNK_X[0], SMOOTH_Y[0] - 3, 0]]
|
| 92 |
+
+ [[x, y, 0] for x, y in zip(CHUNK_X, SMOOTH_Y)]
|
| 93 |
+
+ [[CHUNK_X[-1], SMOOTH_Y[0] - 3, 0]]
|
| 94 |
+
)
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def make_lander(engine="unity", engine_exe=""):
|
| 98 |
+
# Add sm scene
|
| 99 |
+
sc = sm.Scene(engine=engine, engine_exe=engine_exe)
|
| 100 |
+
|
| 101 |
+
# initial lander position sampling
|
| 102 |
+
lander_init_pos = (10, 15, 0) + np.random.uniform(2, 4, 3)
|
| 103 |
+
lander_init_pos[2] = 0.0 # z axis is always 0, for 2D
|
| 104 |
+
|
| 105 |
+
lander_material = sm.Material(base_color=LANDER_COLOR)
|
| 106 |
+
|
| 107 |
+
# create the lander polygons
|
| 108 |
+
|
| 109 |
+
# first, the main lander body
|
| 110 |
+
lander = sm.Polygon(
|
| 111 |
+
points=LANDER_POLY,
|
| 112 |
+
material=lander_material,
|
| 113 |
+
position=lander_init_pos,
|
| 114 |
+
name="lunar_lander",
|
| 115 |
+
is_actor=True,
|
| 116 |
+
physics_component=sm.RigidBodyComponent(
|
| 117 |
+
use_gravity=True,
|
| 118 |
+
constraints=["freeze_rotation_x", "freeze_rotation_y", "freeze_position_z"],
|
| 119 |
+
mass=1,
|
| 120 |
+
),
|
| 121 |
+
)
|
| 122 |
+
|
| 123 |
+
# extrude to make 3D visually.
|
| 124 |
+
lander.mesh.extrude((0, 0, -1), capping=True, inplace=True)
|
| 125 |
+
lander.actuator = sm.Actuator(
|
| 126 |
+
mapping=[
|
| 127 |
+
sm.ActionMapping("add_force", axis=[1, 0, 0], amplitude=5),
|
| 128 |
+
sm.ActionMapping("add_force", axis=[1, 0, 0], amplitude=-5),
|
| 129 |
+
sm.ActionMapping("add_force", axis=[0, 1, 0], amplitude=2.5),
|
| 130 |
+
],
|
| 131 |
+
n=3,
|
| 132 |
+
)
|
| 133 |
+
|
| 134 |
+
# add an invisible box as collider until convex meshes are completed
|
| 135 |
+
lander += sm.Box(
|
| 136 |
+
position=[0, np.min(LEG_RIGHT_POLY, axis=0)[1], -0.5],
|
| 137 |
+
bounds=[0.1, 2 * np.max(LEG_RIGHT_POLY, axis=0)[0], 1],
|
| 138 |
+
material=sm.Material.TRANSPARENT,
|
| 139 |
+
rotation=[0, 0, 90],
|
| 140 |
+
with_collider=True,
|
| 141 |
+
name="lander_collider_box_bottom",
|
| 142 |
+
)
|
| 143 |
+
lander += sm.Box(
|
| 144 |
+
position=[-0.6, 0, -0.5],
|
| 145 |
+
bounds=[0.1, 26 / SCALE, 1],
|
| 146 |
+
material=sm.Material.TRANSPARENT,
|
| 147 |
+
rotation=[0, 0, -15],
|
| 148 |
+
with_collider=True,
|
| 149 |
+
name="lander_collider_box_right",
|
| 150 |
+
)
|
| 151 |
+
lander += sm.Box(
|
| 152 |
+
position=[0.6, 0, -0.5],
|
| 153 |
+
bounds=[0.1, 26 / SCALE, 1],
|
| 154 |
+
material=sm.Material.TRANSPARENT,
|
| 155 |
+
rotation=[0, 0, 15],
|
| 156 |
+
with_collider=True,
|
| 157 |
+
name="lander_collider_box_left",
|
| 158 |
+
)
|
| 159 |
+
|
| 160 |
+
# add legs as children objects (they take positions as local coordinates!)
|
| 161 |
+
r_leg = sm.Polygon(
|
| 162 |
+
points=LEG_RIGHT_POLY,
|
| 163 |
+
material=lander_material,
|
| 164 |
+
parent=lander,
|
| 165 |
+
name="lander_r_leg",
|
| 166 |
+
# with_collider=True, # TODO can use this when convex colliders is added
|
| 167 |
+
)
|
| 168 |
+
r_leg.mesh.extrude((0, 0, -1), capping=True, inplace=True)
|
| 169 |
+
|
| 170 |
+
l_leg = sm.Polygon(
|
| 171 |
+
points=LEG_LEFT_POLY,
|
| 172 |
+
material=lander_material,
|
| 173 |
+
parent=lander,
|
| 174 |
+
name="lander_l_leg",
|
| 175 |
+
# with_collider=True, # TODO can use this when convex colliders is added
|
| 176 |
+
)
|
| 177 |
+
l_leg.mesh.extrude((0, 0, -1), capping=True, inplace=True)
|
| 178 |
+
|
| 179 |
+
# Create land object
|
| 180 |
+
land = sm.Polygon(
|
| 181 |
+
points=LAND_POLY[::-1], # Reversing vertex order so the normal faces the right direction
|
| 182 |
+
material=sm.Material.GRAY,
|
| 183 |
+
name="Moon",
|
| 184 |
+
)
|
| 185 |
+
land.mesh.extrude((0, 0, -1), capping=True, inplace=True)
|
| 186 |
+
|
| 187 |
+
# Create collider blocks for the land (non-convex meshes are TODO)
|
| 188 |
+
for i in range(len(CHUNK_X) - 1):
|
| 189 |
+
x1, x2 = CHUNK_X[i], CHUNK_X[i + 1]
|
| 190 |
+
y1, y2 = SMOOTH_Y[i], SMOOTH_Y[i + 1]
|
| 191 |
+
|
| 192 |
+
# compute rotation from generated coordinates
|
| 193 |
+
rotation = [0, 0, +90 + np.degrees(np.arctan2(y2 - (y1 + y2) / 2, (x2 - x1) / 2))]
|
| 194 |
+
block_i = sm.Box(
|
| 195 |
+
position=[(x1 + x2) / 2, (y1 + y2) / 2, -0.5],
|
| 196 |
+
bounds=[0.2, 1.025 * np.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2), 1], # adjustment for better colliders
|
| 197 |
+
material=sm.Material.GRAY,
|
| 198 |
+
rotation=rotation,
|
| 199 |
+
with_collider=True,
|
| 200 |
+
name="land_collider_" + str(i),
|
| 201 |
+
)
|
| 202 |
+
sc += block_i
|
| 203 |
+
|
| 204 |
+
# add target triangle / cone for reward
|
| 205 |
+
sc += sm.Cone(
|
| 206 |
+
position=[(HELIPAD_x1 + HELIPAD_x2) / 2, HELIPAD_y, -0.5],
|
| 207 |
+
height=10 / SCALE,
|
| 208 |
+
radius=10 / SCALE,
|
| 209 |
+
material=sm.Material.YELLOW,
|
| 210 |
+
name="target",
|
| 211 |
+
)
|
| 212 |
+
|
| 213 |
+
# TODO add lander state sensors for state-based RL
|
| 214 |
+
sc += sm.StateSensor(
|
| 215 |
+
target_entity=sc.target,
|
| 216 |
+
reference_entity=lander,
|
| 217 |
+
properties=["position", "rotation", "distance"],
|
| 218 |
+
name="goal_sense",
|
| 219 |
+
)
|
| 220 |
+
|
| 221 |
+
# create Euclidean distance reward, scalar changes the reward to a cost
|
| 222 |
+
cost = sm.RewardFunction(
|
| 223 |
+
type="dense", entity_a=lander, entity_b=sc.target, scalar=-1
|
| 224 |
+
) # By default a dense reward equal to the distance between 2 entities
|
| 225 |
+
lander += cost
|
| 226 |
+
|
| 227 |
+
sc += lander
|
| 228 |
+
sc += land
|
| 229 |
+
|
| 230 |
+
return sc
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
def get_values(state):
|
| 234 |
+
return state.get("StateSensor")
|
| 235 |
+
|
| 236 |
+
def train(agent, env, logger):
|
| 237 |
+
episodes = 20000
|
| 238 |
+
for e in range(episodes):
|
| 239 |
+
|
| 240 |
+
state = env.reset()
|
| 241 |
+
# Play the game!
|
| 242 |
+
for i in range(100):
|
| 243 |
+
|
| 244 |
+
# Run agent on the state
|
| 245 |
+
action = agent.act(get_values(state))
|
| 246 |
+
# env.render()
|
| 247 |
+
# Agent performs action
|
| 248 |
+
next_state, reward, done, info = env.step(action)
|
| 249 |
+
|
| 253 |
+
|
| 254 |
+
# Remember
|
| 255 |
+
agent.cache(get_values(state), get_values(next_state), action, reward, done)
|
| 256 |
+
|
| 257 |
+
# Learn
|
| 258 |
+
q, loss = agent.learn()
|
| 259 |
+
|
| 260 |
+
# Logging
|
| 261 |
+
logger.log_step(reward, loss, q)
|
| 262 |
+
|
| 263 |
+
# Update state
|
| 264 |
+
state = next_state
|
| 265 |
+
|
| 266 |
+
# Check if end of game
|
| 267 |
+
if done:
|
| 268 |
+
break
|
| 269 |
+
|
| 270 |
+
logger.log_episode(e)
|
| 271 |
+
|
| 272 |
+
if e % 20 == 0:
|
| 273 |
+
logger.record(episode=e, epsilon=agent.exploration_rate, step=agent.curr_step)
|
| 274 |
+
|
| 275 |
+
|
| 276 |
+
if __name__ == "__main__":
|
| 277 |
+
parser = argparse.ArgumentParser()
|
| 278 |
+
parser.add_argument("--build_exe", default="", type=str, required=False, help="Pre-built unity app for simulate")
|
| 279 |
+
parser.add_argument(
|
| 280 |
+
"--num_steps", default=100, type=int, required=False, help="number of steps to run the simulator"
|
| 281 |
+
)
|
| 282 |
+
args = parser.parse_args()
|
| 283 |
+
|
| 284 |
+
sc = make_lander(engine="unity", engine_exe=args.build_exe)
|
| 285 |
+
sc += sm.LightSun()
|
| 286 |
+
|
| 287 |
+
env = sm.RLEnv(sc, frame_skip=1)
|
| 288 |
+
env.reset()
|
| 289 |
+
|
| 290 |
+
# for i in range(500):
|
| 291 |
+
# print(sc.observation_space.sample())
|
| 292 |
+
# action = [sc.action_space.sample()]
|
| 293 |
+
# print("###############")
|
| 294 |
+
# print(action)
|
| 295 |
+
# obs, reward, done, info = env.step(action)
|
| 296 |
+
# print(obs)
|
| 297 |
+
# print(f"step {i}, reward {reward[0]}")
|
| 298 |
+
# time.sleep(0.1)
|
| 299 |
+
|
| 300 |
+
# env.close()
|
| 301 |
+
|
| 302 |
+
checkpoint = None
|
| 303 |
+
# checkpoint = Path('checkpoints/latest/airstriker_net_3.chkpt')
|
| 304 |
+
|
| 305 |
+
path = "checkpoints/lunar-lander-dueling-dqn-rc"
|
| 306 |
+
save_dir = Path(path)
|
| 307 |
+
|
| 308 |
+
isExist = os.path.exists(path)
|
| 309 |
+
if not isExist:
|
| 310 |
+
os.makedirs(path)
|
| 311 |
+
|
| 312 |
+
logger = MetricLogger(save_dir)
|
| 313 |
+
|
| 314 |
+
print("Training Dueling DQN Agent with step decay!")
|
| 315 |
+
agent = DuelingDQNAgent(
|
| 316 |
+
state_dim=7,
|
| 317 |
+
action_dim=env.action_space.n,
|
| 318 |
+
save_dir=save_dir,
|
| 319 |
+
checkpoint=checkpoint,
|
| 320 |
+
**hyperparams
|
| 321 |
+
)
|
| 322 |
+
# print("Training Dueling DQN Agent!")
|
| 323 |
+
# agent = DuelingDQNAgent(
|
| 324 |
+
# state_dim=8,
|
| 325 |
+
# action_dim=env.action_space.n,
|
| 326 |
+
# save_dir=save_dir,
|
| 327 |
+
# checkpoint=checkpoint,
|
| 328 |
+
# **hyperparams
|
| 329 |
+
# )
|
| 330 |
+
|
| 331 |
+
# fill_memory(agent, env, 5000)
|
| 332 |
+
train(agent, env, logger)
|
params.py
ADDED
|
@@ -0,0 +1,12 @@
+hyperparams = dict(
+    batch_size=128,
+    exploration_rate=1,
+    exploration_rate_decay=0.99999,
+    exploration_rate_min=0.01,
+    training_frequency=1,
+    target_network_sync_frequency=20,
+    max_memory_size=1000000,
+    learning_rate=0.001,
+    learning_starts=128,
+    save_frequency=100000
+)