Anoozh-Akileswaran committed
Commit c3ec5ed · Parent: d937e11

First results from observation/return/reward norm.
Files changed (32)
  1. CNN_PPO/ppo_helpers_cnn.py +2 -1
  2. Observation_Advantage_Norm/PPO_Obser_Adva_Norm.py +0 -355
  3. Observation_Advantage_Norm_diff_combo/ppo__rew_norm_obs_diff_combo.py +1254 -0
  4. Observation_Advantage_Norm_diff_combo/ppo_rew_norm_obs_env_diff_combo.py +201 -0
  5. Observation_Advantage_Norm_diff_env/ppo__rew_norm_obs_diff_env.py +891 -0
  6. Observation_Advantage_Norm_diff_env/ppo_rew_norm_obs_env_diff_env.py +191 -0
  7. Observation_Advantage_Norm_diff_hypo/Performance config for Learning Rate of update_advantage_norm.png +0 -0
  8. Observation_Advantage_Norm_diff_hypo/Performance config for Learning Rate of update_observation_norm.png +0 -0
  9. Observation_Advantage_Norm_diff_hypo/Performance config for Learning Rate of update_return_norm.png +0 -0
  10. Observation_Advantage_Norm_diff_hypo/Performance config for Learning Rate of vanilla_ppo_update.png +0 -0
  11. Observation_Advantage_Norm_diff_hypo/Performance config for entropy coefficient of update_advantage_norm.png +0 -0
  12. Observation_Advantage_Norm_diff_hypo/Performance config for entropy coefficient of update_observation_norm.png +0 -0
  13. Observation_Advantage_Norm_diff_hypo/Performance config for entropy coefficient of update_return_norm.png +0 -0
  14. Observation_Advantage_Norm_diff_hypo/Performance config for entropy coefficient of vanilla_ppo_update.png +0 -0
  15. Observation_Advantage_Norm_diff_hypo/Performance config for gamma value of update_advantage_norm.png +0 -0
  16. Observation_Advantage_Norm_diff_hypo/Performance config for gamma value of update_observation_norm.png +0 -0
  17. Observation_Advantage_Norm_diff_hypo/Performance config for gamma value of update_return_norm.png +0 -0
  18. Observation_Advantage_Norm_diff_hypo/Performance config for gamma value of vanilla_ppo_update.png +0 -0
  19. Observation_Advantage_Norm_diff_hypo/ppo__rew_norm_obs_diff_hyp.py +890 -0
  20. Observation_Advantage_Norm/PPO_environment.py → Observation_Advantage_Norm_diff_hypo/ppo_rew_norm_obs_env_diff_hypo.py +109 -44
  21. Observation_Advantage_Norm_in_batch/ppo__rew_norm_obs_in_batch.py +829 -0
  22. Observation_Advantage_Norm_in_batch/ppo_rew_norm_obs_env_in_batch.py +163 -0
  23. Observation_Advantage_Norm_in_batch/update_advantage_norm_in_batch.png +0 -0
  24. Observation_Advantage_Norm_in_batch/update_observation_norm_in_batch.png +0 -0
  25. Observation_Advantage_Norm_in_batch/update_return_norm_in_batch.png +0 -0
  26. Observation_Advantage_Norm_in_batch/vanilla_ppo_update_in_batch.png +0 -0
  27. Observation_Advantage_Norm_running_averages/ppo__rew_norm_obs_running_average.py +893 -0
  28. Observation_Advantage_Norm_running_averages/ppo_rew_norm_obs_env_running_average.py +163 -0
  29. Observation_Advantage_Norm_running_averages/update_advantage_norm_running_average_.png +0 -0
  30. Observation_Advantage_Norm_running_averages/update_observation_norm_running_average_.png +0 -0
  31. Observation_Advantage_Norm_running_averages/update_return_norm_running_average_.png +0 -0
  32. Observation_Advantage_Norm_running_averages/vanilla_ppo_update_running_average_.png +0 -0
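All of the normalizers this commit adds (`ObservationNorm`, `AdvantageNorm`, `ReturnNorm`, and the older `ObservationScaling`) share one recurrence: Chan et al.'s parallel mean/variance update followed by a standardization with an epsilon guard. A condensed sketch of that shared logic, under the hypothetical name `RunningMoments` (the repo splits it across three near-identical classes):

```python
import torch as T

class RunningMoments:
    # Running mean/variance via the parallel-moments update used by the
    # ObservationNorm / AdvantageNorm / ReturnNorm classes in this commit.
    def __init__(self):
        self.mean, self.var, self.count = 0.0, 0.0, 1e-4

    def update(self, x: T.Tensor):
        batch_mean = T.mean(x, dim=0)
        batch_var = T.var(x, dim=0)
        batch_count = x.shape[0]
        delta = batch_mean - self.mean
        tot_count = self.count + batch_count
        # Combine the stored moments with the batch moments (Chan's formula)
        m2 = (self.var * self.count + batch_var * batch_count
              + delta ** 2 * self.count * batch_count / tot_count)
        self.mean = self.mean + delta * batch_count / tot_count
        self.var = m2 / tot_count
        self.count = tot_count

    def normalize(self, x: T.Tensor) -> T.Tensor:
        # Epsilon keeps the division safe before any update has happened
        return (x - self.mean) / (T.sqrt(T.as_tensor(self.var)) + 1e-8)

# Usage: update with each rollout's advantages, then standardize them
norm = RunningMoments()
adv = T.randn(128)
norm.update(adv)
adv = norm.normalize(adv)
```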
CNN_PPO/ppo_helpers_cnn.py CHANGED
@@ -144,7 +144,7 @@ class Agent:
             # Shuffle indices
             idxs = T.randperm(num_samples)
             for start in range(0, num_samples, batch_size):
-                batch_idx = idxs[start:start + batch_size]
+                batch_idx = idxs[start:start + batch_size]  # array of indices for this minibatch
 
                 b_states = states[batch_idx]
                 b_actions = actions[batch_idx]
@@ -187,6 +187,7 @@ class Agent:
         self.memory.clear()
 
         return total_loss_epoch / (ppo_epochs * (num_samples / batch_size))
+        # mean total loss per minibatch update (ppo_epochs * num_samples / batch_size updates)
 
 
     def update_rbs(self):
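One caveat on the returned mean: the denominator `ppo_epochs * (num_samples / batch_size)` equals the number of optimizer steps only when `batch_size` divides `num_samples` evenly; the final short minibatch otherwise skews the average slightly. A sketch of the exact version, with a hypothetical helper name:

```python
import math

def mean_minibatch_loss(total_loss_epoch: float, ppo_epochs: int,
                        num_samples: int, batch_size: int) -> float:
    # ceil counts the trailing short minibatch too, so the average is exact
    # even when batch_size does not divide num_samples.
    n_updates = ppo_epochs * math.ceil(num_samples / batch_size)
    return total_loss_epoch / n_updates
```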
Observation_Advantage_Norm/PPO_Obser_Adva_Norm.py DELETED
@@ -1,355 +0,0 @@
-import numpy as np
-import torch as T
-import torch.nn as nn
-import torch.optim as optim
-from torch.distributions import Categorical
-
-
-class Agent():
-    # Minimal PPO-Clip agent (single full-batch update per episode, MC returns)
-    def __init__(
-        self,
-        obs_space,
-        action_space,
-        hidden,
-        gamma,
-        clip_coef,
-        lr,
-        value_coef,
-        entropy_coef,
-        seed
-    ):
-        # Initialize seed for reproducibility
-        if seed is not None:
-            np.random.seed(seed)
-            T.manual_seed(seed)
-
-        # Use GPU if available
-        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
-        self.obs_dim = int(np.prod(getattr(obs_space, "shape", (obs_space,))))
-        self.action_dim = int(getattr(action_space, "n", action_space))
-
-        # Initialize the policy and the critic networks
-        self.policy = Policy(self.obs_dim, self.action_dim, hidden).to(self.device)
-        self.critic = Critic(self.obs_dim, hidden).to(self.device)
-
-        # Set optimizer for policy and critic networks
-        self.opt = optim.Adam(
-            list(self.policy.parameters()) + list(self.critic.parameters()),
-            lr=lr
-        )
-        # Initialize the hyperparameters
-        self.gamma = gamma
-        self.clip = clip_coef
-        self.value_coef = value_coef
-        self.entropy_coef = entropy_coef
-        # Initialize the memory to store the state, action, reward, ...
-        self.memory = Memory()
-        self.observationScaling = ObservationScaling()
-        self.advantageNorm = AdvantageNorm()
-        self.total_loss = 0
-
-    def choose_action(self, observation):
-        # Returns: action, log probability, value of the state
-        state = T.as_tensor(observation, dtype=T.float32, device=self.device).view(-1)
-        with T.no_grad():
-            # Forward function (defined in Policy class)
-            dist = self.policy.next_action(state)
-            # Sample from the action distribution
-            action = dist.sample()
-            logp = dist.log_prob(action)  # log πθ(a|s)
-            # Value the current state
-            value = self.critic.evaluated_state(state)
-        return int(action.item()), float(logp.item()), float(value.item())
-
-    def remember(self, state, action, reward, done, log_prob, value, next_state):
-        # Store the info
-        with T.no_grad():
-            # Pass on the next state and have it evaluated by the critic network
-            ns = T.as_tensor(next_state, dtype=T.float32, device=self.device).view(-1)
-            next_value = self.critic.evaluated_state(ns).item()
-        self.memory.store(state, action, reward, done, log_prob, value, next_value)
-
-    """
-    def run_episode(self, env, max_steps: int, render: bool = False):
-        # Runs one episode, updates the policy once at the end
-        self.memory.clear()
-        out = env.reset()
-
-        state = out[0] if isinstance(out, tuple) else out
-
-        ep_return, ep_len = 0, 0
-
-        steps_limit = max_steps if max_steps is not None else float("inf")
-
-        while ep_len < steps_limit:
-            if render and hasattr(env, "render"):
-                env.render()
-
-            action, logp, value = self.choose_action(state)
-            step_out = env.step(action)
-            if len(step_out) == 5:
-                next_state, reward, terminated, truncated, _ = step_out
-                done = terminated or truncated
-            else:
-                next_state, reward, done, _ = step_out
-
-            self.remember(state, action, reward, done, logp, value, next_state)
-
-            ep_return += float(reward)
-            ep_len += 1
-            state = next_state
-            if done:
-                break
-
-        self._update()
-        return ep_return, ep_len
-
-    def run_episodes(self, env, n_episodes: int, max_steps: int, render: bool = False):
-        returns = []
-        for _ in range(n_episodes):
-            ep_ret, _ = self.run_episode(env, max_steps=max_steps, render=render)
-            returns.append(ep_ret)
-        return returns
-    """
-
-    def _update(self, mode, observationNorm, advantageNorm):
-        if len(self.memory.states) == 0:
-            return
-
-        states = T.as_tensor(np.array(self.memory.states), dtype=T.float32, device=self.device)
-        actions = T.as_tensor(self.memory.actions, dtype=T.long, device=self.device)
-        rewards = T.as_tensor(self.memory.rewards, dtype=T.float32, device=self.device)
-        dones = T.as_tensor(self.memory.dones, dtype=T.float32, device=self.device)
-        old_logp = T.as_tensor(self.memory.log_probs, dtype=T.float32, device=self.device)
-        values = T.as_tensor(self.memory.values, dtype=T.float32, device=self.device)
-        ### Normalization happens here
-        if observationNorm:
-            self.observationScaling.update(states)
-            states = self.observationScaling.normalize(states)
-        ###
-        # Monte Carlo returns (episode-aware)
-        # Returns the discounted sum of future rewards
-        with T.no_grad():
-            returns = T.zeros_like(rewards)
-            G = 0.0
-            for t in reversed(range(rewards.size(0))):
-                G = rewards[t] + self.gamma * G * (1.0 - dones[t])
-                returns[t] = G
-        # Compute advantage + advantage normalization in-batch
-        adv = returns - values
-        if advantageNorm:
-            self.advantageNorm.update(adv)
-            adv = self.advantageNorm.normalize(adv)
-
-        # adv = (adv - adv.mean()) / (adv.std(unbiased=False) + 1e-8)
-        # Recompute the distribution under the current policy
-        dist = self.policy.next_action(states)
-        new_logp = dist.log_prob(actions)
-
-        """PPO components: policy update, weighted probability distribution, clipped returns"""
-
-        # Updating the policy: update the probability distribution (i.e., compute clipped probs)
-        ratio = (new_logp - old_logp).exp()  # r_t = πθ / πθ_old
-
-        # Weighted probability distribution (according to the formula/update rule)
-        surr1 = ratio * adv
-        surr2 = T.clamp(ratio, 1 - self.clip, 1 + self.clip) * adv
-        value_pred = self.critic.evaluated_state(states)
-        beta = 1.0
-        target_kl = 0.01
-
-        # PPO standards
-        if mode == "clip":
-            surr1 = ratio * adv
-            surr2 = T.clamp(ratio, 1 - self.clip, 1 + self.clip) * adv
-            policy_loss = -T.min(surr1, surr2).mean()
-            print(f"Current policy loss: {policy_loss} with mode: {mode}")
-
-        elif mode == "kl_penalty":
-            # Penalize the policy if it changes too much
-            policy_loss = -(ratio * adv).mean()
-            approx_kl = (old_logp - new_logp).mean()
-            policy_loss = policy_loss + beta * approx_kl
-            # Adapt beta toward target_kl
-            if approx_kl > 1.5 * target_kl:
-                beta *= 2.0  # too big a step → increase penalty
-            elif approx_kl < 0.5 * target_kl:
-                beta *= 0.5  # too small a step → allow bigger updates
-            print(f"Current policy loss: {policy_loss} with mode: {mode}")
-
-        elif mode == "unclipped_earlystop":
-            policy_loss = -(ratio * adv).mean()
-            approx_kl = (old_logp - new_logp).mean()
-            if approx_kl.item() > 1.5 * target_kl:
-                # Skip the optimizer step this update / end further epochs
-                print(f"Current policy loss: {policy_loss} with mode: {mode}")
-                self.memory.clear()
-                return
-
-        # Loss: MSE of (return - critic value)
-        value_loss = 0.5 * (returns - value_pred).pow(2).mean()
-        # Entropy (account for randomness in action selection)
-        entropy = dist.entropy().mean()
-        # Total loss: policy loss + constant * value loss - constant * entropy
-        self.total_loss = policy_loss + self.value_coef * value_loss - self.entropy_coef * entropy
-
-        self.opt.zero_grad(set_to_none=True)
-        self.total_loss.backward()
-        self.opt.step()
-
-        self.memory.clear()
-
-
-class Policy(nn.Module):
-    def __init__(self, obs_dim: int, action_dim: int, hidden: int):
-        super().__init__()
-        self.net = nn.Sequential(
-            nn.Linear(obs_dim, hidden),
-            nn.ReLU(),
-            nn.Linear(hidden, hidden),
-            nn.ReLU(),
-            nn.Linear(hidden, action_dim)
-        )
-
-    def next_action(self, state: T.Tensor) -> Categorical:
-        # Returns the probability distribution over actions
-        if state.dim() == 1:
-            state = state.unsqueeze(0)
-        state = state.view(state.size(0), -1)
-        return Categorical(logits=self.net(state))
-
-
-class Critic(nn.Module):
-    def __init__(self, obs_dim: int, hidden: int):
-        super().__init__()
-        self.net = nn.Sequential(
-            nn.Linear(obs_dim, hidden),
-            nn.ReLU(),
-            nn.Linear(hidden, hidden),
-            nn.ReLU(),
-            nn.Linear(hidden, 1)
-        )
-
-    def evaluated_state(self, x: T.Tensor) -> T.Tensor:
-        if x.dim() == 1:
-            x = x.unsqueeze(0)
-        x = x.view(x.size(0), -1)
-        return self.net(x).squeeze(-1)
-
-
-class Memory():
-    def __init__(self):
-        self.states = []
-        self.actions = []
-        self.rewards = []
-        self.dones = []
-        self.log_probs = []
-        self.values = []
-        self.next_values = []
-
-    def store(self, state, action, reward, done, log_prob, value, next_value):
-        self.states.append(np.asarray(state, dtype=np.float32))
-        self.actions.append(int(action))
-        self.rewards.append(float(reward))
-        self.dones.append(float(done))
-        self.log_probs.append(float(log_prob))
-        self.values.append(float(value))
-        self.next_values.append(float(next_value))
-
-    """
-    # For mini-batch updates? To be implemented
-    def start_batch(self, batch_size: int):
-        n_states = len(self.states)
-        starts = np.arange(0, n_states, batch_size)
-        index = np.arange(n_states, dtype=np.int64)
-        np.random.shuffle(index)
-        return [index[s:s + batch_size] for s in starts]
-    """
-
-    def clear(self):
-        self.states = []
-        self.actions = []
-        self.rewards = []
-        self.dones = []
-        self.log_probs = []
-        self.values = []
-        self.next_values = []
-
-
-class AdvantageNorm:
-    '''
-    This class implements advantage normalization. The purpose is to normalize either across
-    batches or only within the same batch.
-    '''
-    def __init__(self):
-        self.main_mean = 0
-        self.main_var = 0
-        self.count = 1e-4
-
-    def update(self, x: T.Tensor):
-        print("I am updating the main mean and main variance")
-        batch_mean = T.mean(x, dim=0)
-        batch_var = T.var(x, dim=0)
-        batch_count = x.shape[0]
-        self._update_from_moments(batch_mean, batch_var, batch_count)
-
-    def _update_from_moments(self, batch_mean, batch_var, batch_count):
-        delta = batch_mean - self.main_mean
-        tot_count = self.count + batch_count
-        new_mean = self.main_mean + delta * batch_count / tot_count  # Update the running mean
-        m_a = self.main_var * self.count
-        m_b = batch_var * batch_count
-        M2 = m_a + m_b + np.square(delta) * self.count * batch_count / tot_count
-        new_var = M2 / tot_count  # Update the running variance
-
-        self.main_mean, self.main_var, self.count = new_mean, new_var, tot_count
-
-    def normalize(self, x):
-        print("I apply normalization on the advantages")
-        # Epsilon makes sure we never divide by zero
-        return (x - self.main_mean) / (np.sqrt(self.main_var) + 1e-8)
-
-
-class ObservationScaling:
-    def __init__(self):
-        self.main_mean = 0
-        self.main_var = 0
-        self.count = 1e-4
-
-    def update(self, x: T.Tensor):
-        print("I am updating the main mean and main variance")
-        batch_mean = T.mean(x, dim=0)
-        batch_var = T.var(x, dim=0)
-        batch_count = x.shape[0]
-        self._update_from_moments(batch_mean, batch_var, batch_count)
-
-    def _update_from_moments(self, batch_mean, batch_var, batch_count):
-        delta = batch_mean - self.main_mean
-        tot_count = self.count + batch_count
-        new_mean = self.main_mean + delta * batch_count / tot_count  # Update the running mean
-        m_a = self.main_var * self.count
-        m_b = batch_var * batch_count
-        M2 = m_a + m_b + np.square(delta) * self.count * batch_count / tot_count
-        new_var = M2 / tot_count  # Update the running variance
-
-        self.main_mean, self.main_var, self.count = new_mean, new_var, tot_count
-
-    def normalize(self, x):
-        print("I apply normalization on the observations")
-        # Epsilon makes sure we never divide by zero
-        return (x - self.main_mean) / (np.sqrt(self.main_var) + 1e-8)
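The deleted script's `kl_penalty` mode declared `beta = 1.0` as a local variable, so its doubling/halving never carried over to the next update. A minimal sketch of that adaptive-KL rule with the coefficient returned to the caller (hypothetical function name; tensors as in the deleted `_update`):

```python
import torch as T

def kl_penalty_policy_loss(ratio: T.Tensor, adv: T.Tensor,
                           old_logp: T.Tensor, new_logp: T.Tensor,
                           beta: float, target_kl: float = 0.01):
    # Unclipped surrogate plus a penalty on the approximate KL divergence
    policy_loss = -(ratio * adv).mean() + beta * (old_logp - new_logp).mean()
    approx_kl = (old_logp - new_logp).mean().item()
    # Adapt beta toward target_kl; the caller must carry beta across updates
    # for the adaptation to take effect.
    if approx_kl > 1.5 * target_kl:
        beta *= 2.0   # step too large: increase the penalty
    elif approx_kl < 0.5 * target_kl:
        beta *= 0.5   # step too small: allow bigger updates
    return policy_loss, beta
```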
Observation_Advantage_Norm_diff_combo/ppo__rew_norm_obs_diff_combo.py ADDED
@@ -0,0 +1,1254 @@
+import numpy as np
+import torch as T
+import torch.nn as nn
+import torch.optim as optim
+from torch.distributions import Categorical
+
+
+class Agent:
+    def __init__(
+        self,
+        obs_space,
+        action_space,
+        hidden,
+        gamma,
+        clip_coef,
+        lr,
+        value_coef,
+        entropy_coef,
+        seed,
+        batch_size,
+        ppo_epochs,
+        lam,
+        update_type
+    ):
+        # Initialize seed for reproducibility
+        if seed is not None:
+            np.random.seed(seed)
+            T.manual_seed(seed)
+        """
+        # For flat observations (MLP model)
+        # Use GPU if available
+        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
+        self.obs_dim = int(np.prod(getattr(obs_space, "shape", (obs_space,))))
+        self.action_dim = int(getattr(action_space, "n", action_space))
+
+        # Initialize the policy and the critic networks
+        self.policy = Policy(self.obs_dim, self.action_dim, hidden).to(self.device)
+        self.critic = Critic(self.obs_dim, hidden).to(self.device)
+        """
+        # Use GPU if available
+        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
+        self.action_dim = int(getattr(action_space, "n", action_space))
+        self.update_type = update_type
+
+        # Initialize the policy and the critic networks
+        # Pass the shape tuple directly, not the flattened dimension.
+        self.policy = Policy(obs_space.shape, self.action_dim, hidden).to(self.device)
+        self.critic = Critic(obs_space.shape, hidden).to(self.device)
+        self.observeNorm = ObservationNorm()
+        self.advantageNorm = AdvantageNorm()
+        self.returnNorm = ReturnNorm()
+
+        # Set optimizer for policy and critic networks
+        self.opt = optim.Adam(
+            list(self.policy.parameters()) + list(self.critic.parameters()),
+            lr=lr
+        )
+
+        self.gamma = gamma
+        self.clip = clip_coef
+        self.value_coef = value_coef
+        self.entropy_coef = entropy_coef
+        self.sigma_history = []
+        self.loss_history = []
+        self.policy_loss_history = []
+        self.value_loss_history = []
+        self.entropy_history = []
+        self.lam = lam
+        self.ppo_epochs = ppo_epochs
+        self.batch_size = batch_size
+
+        self.memory = Memory()
+    """
+    # Choose action and remember for flat observations (MLP model)
+    def choose_action(self, observation):
+        # Returns: action, log probability, value of the state
+        state = T.as_tensor(observation, dtype=T.float32, device=self.device).view(-1)
+        with T.no_grad():
+            # Forward function (defined in Policy class)
+            dist = self.policy.next_action(state)
+            action = dist.sample()
+            logp = dist.log_prob(action)
+            value = self.critic.evaluated_state(state)
+        return int(action.item()), float(logp.item()), float(value.item())
+
+    def remember(self, state, action, reward, done, log_prob, value, next_state):
+        with T.no_grad():
+            # Pass on the next state and have it evaluated by the critic network
+            ns = T.as_tensor(next_state, dtype=T.float32, device=self.device).view(-1)
+            next_value = self.critic.evaluated_state(ns).item()
+        self.memory.store(state, action, reward, done, log_prob, value, next_value)
+    """
+    # For CNN model
+    def choose_action(self, observation):
+        # Returns: action, log probability, value of the state
+        state = T.as_tensor(observation, dtype=T.float32, device=self.device)  # Remove .view(-1)
+        with T.no_grad():
+            # Forward function (defined in Policy class)
+            dist = self.policy.next_action(state)
+            action = dist.sample()
+            logp = dist.log_prob(action)
+            value = self.critic.evaluated_state(state)
+        return int(action.item()), float(logp.item()), float(value.item())
+
+    def remember(self, state, action, reward, done, log_prob, value, next_state):
+        with T.no_grad():
+            # Pass on the next state and have it evaluated by the critic network
+            ns = T.as_tensor(next_state, dtype=T.float32, device=self.device)  # Remove .view(-1)
+            next_value = self.critic.evaluated_state(ns).item()
+        self.memory.store(state, action, reward, done, log_prob, value, next_value)
+
+    def _update(self):
+        if self.update_type == "update_all_norm":
+            return self.update_all_norm()
+        elif self.update_type == "update_observation_advantage_norm":
+            return self.update_observation_advantage_norm()
+        elif self.update_type == "update_observation_return_norm":
+            return self.update_observation_return_norm()
+        elif self.update_type == "update_advantage_return_norm":
+            return self.update_advantage_return_norm()
+        else:
+            return self.vanilla_ppo_update()
+
+    def vanilla_ppo_update(self):
+        if len(self.memory.states) == 0:
+            return 0.0
+
+        # Convert memory to tensors
+        states = T.as_tensor(np.array(self.memory.states), dtype=T.float32, device=self.device)
+        actions = T.as_tensor(self.memory.actions, dtype=T.long, device=self.device)
+        rewards = T.as_tensor(self.memory.rewards, dtype=T.float32, device=self.device)
+        dones = T.as_tensor(self.memory.dones, dtype=T.float32, device=self.device)
+        old_logp = T.as_tensor(self.memory.log_probs, dtype=T.float32, device=self.device)
+        values = T.as_tensor(self.memory.values, dtype=T.float32, device=self.device)
+
+        with T.no_grad():
+            # Compute next values (bootstrap for final step)
+            next_values = T.cat([values[1:], values[-1:].clone()])
+            deltas = rewards + self.gamma * next_values * (1 - dones) - values
+
+            # --- GAE-Lambda ---
+            adv = T.zeros_like(rewards)
+            gae = 0.0
+            for t in reversed(range(len(rewards))):
+                gae = deltas[t] + self.gamma * self.lam * (1 - dones[t]) * gae
+                adv[t] = gae
+
+            returns = adv + values
+            # Advantage normalization
+            adv = (adv - adv.mean()) / (adv.std(unbiased=False) + 1e-8)
+
+        # --- PPO Multiple Epochs + Minibatch ---
+        total_loss_epoch = 0.0
+        num_samples = len(states)
+        batch_size = min(64, num_samples)
+        ppo_epochs = 4
+
+        for _ in range(ppo_epochs):
+            # Shuffle indices
+            idxs = T.randperm(num_samples)
+            for start in range(0, num_samples, batch_size):
+                batch_idx = idxs[start:start + batch_size]
+
+                b_states = states[batch_idx]
+                b_actions = actions[batch_idx]
+                b_old_logp = old_logp[batch_idx]
+                b_returns = returns[batch_idx]
+                b_adv = adv[batch_idx]
+
+                dist = self.policy.next_action(b_states)
+                new_logp = dist.log_prob(b_actions)
+                entropy = dist.entropy().mean()
+                ratio = (new_logp - b_old_logp).exp()
+
+                # --- Clipped surrogate objective ---
+                surr1 = ratio * b_adv
+                surr2 = T.clamp(ratio, 1 - self.clip, 1 + self.clip) * b_adv
+                policy_loss = -T.min(surr1, surr2).mean()
+
+                # --- Critic loss ---
+                value_pred = self.critic.evaluated_state(b_states)
+                value_loss = 0.5 * (b_returns - value_pred).pow(2).mean()
+
+                # --- Total loss ---
+                total_loss = (
+                    policy_loss +
+                    self.value_coef * value_loss -
+                    self.entropy_coef * entropy
+                )
+
+                # Debug: track individual loss components
+                self.policy_loss_history.append(policy_loss.item())
+                self.value_loss_history.append(value_loss.item())
+
+                self.opt.zero_grad(set_to_none=True)
+                total_loss.backward()
+                self.opt.step()
+
+                total_loss_epoch += total_loss.item()
+
+        # Clear memory after full PPO update
+        self.memory.clear()
+
+        return total_loss_epoch / (ppo_epochs * (num_samples / batch_size))
+
+    def update_rbs(self):
+        if len(self.memory.states) == 0:
+            return 0.0
+
+        # Convert memory to tensors
+        states = T.as_tensor(np.array(self.memory.states), dtype=T.float32, device=self.device)
+        actions = T.as_tensor(self.memory.actions, dtype=T.long, device=self.device)
+        rewards = T.as_tensor(self.memory.rewards, dtype=T.float32, device=self.device)
+        dones = T.as_tensor(self.memory.dones, dtype=T.float32, device=self.device)
+        old_logp = T.as_tensor(self.memory.log_probs, dtype=T.float32, device=self.device)
+        values = T.as_tensor(self.memory.values, dtype=T.float32, device=self.device)
+
+        with T.no_grad():
+            # Compute next values (bootstrap for final step)
+            next_values = T.cat([values[1:], values[-1:].clone()])
+            deltas = rewards + self.gamma * next_values * (1 - dones) - values
+
+            # --- GAE-Lambda ---
+            adv = T.zeros_like(rewards)
+            gae = 0.0
+            for t in reversed(range(len(rewards))):
+                gae = deltas[t] + self.gamma * self.lam * (1 - dones[t]) * gae
+                adv[t] = gae
+
+            returns = adv + values
+
+            # --- Return-based normalization (RBS) ---
+            sigma_t = returns.std(unbiased=False) + 1e-8
+            returns = returns / sigma_t
+            self.sigma_history.append(sigma_t.item())
+            adv = adv / sigma_t
+            # Advantage normalization
+            adv = (adv - adv.mean()) / (adv.std(unbiased=False) + 1e-8)
+
+        # --- PPO Multiple Epochs + Minibatch ---
+        total_loss_epoch = 0.0
+        num_samples = len(states)
+        batch_size = min(64, num_samples)
+        ppo_epochs = 4
+
+        for _ in range(ppo_epochs):
+            # Shuffle indices
+            idxs = T.randperm(num_samples)
+            for start in range(0, num_samples, batch_size):
+                batch_idx = idxs[start:start + batch_size]
+
+                b_states = states[batch_idx]
+                b_actions = actions[batch_idx]
+                b_old_logp = old_logp[batch_idx]
+                b_returns = returns[batch_idx]
+                b_adv = adv[batch_idx]
+
+                dist = self.policy.next_action(b_states)
+                new_logp = dist.log_prob(b_actions)
+                entropy = dist.entropy().mean()
+                ratio = (new_logp - b_old_logp).exp()
+
+                # --- Clipped surrogate objective ---
+                surr1 = ratio * b_adv
+                surr2 = T.clamp(ratio, 1 - self.clip, 1 + self.clip) * b_adv
+                policy_loss = -T.min(surr1, surr2).mean()
+
+                # --- Critic loss ---
+                value_pred = self.critic.evaluated_state(b_states)
+                value_loss = 0.5 * (b_returns - value_pred).pow(2).mean()
+
+                # --- Total loss ---
+                total_loss = (
+                    policy_loss +
+                    self.value_coef * value_loss -
+                    self.entropy_coef * entropy
+                )
+
+                # Debug: track individual loss components
+                self.policy_loss_history.append(policy_loss.item())
+                self.value_loss_history.append(value_loss.item())
+
+                self.opt.zero_grad(set_to_none=True)
+                total_loss.backward()
+                self.opt.step()
+                total_loss_epoch += total_loss.item()
+
+        # Clear memory after full PPO update
+        self.memory.clear()
+
+        return total_loss_epoch / (ppo_epochs * (num_samples / batch_size))
+
+    '''
+    Different combinations of normalization techniques, combined to test whether
+    performance improves.
+    '''
+
+    def update_all_norm(self):
+        if len(self.memory.states) == 0:
+            return 0.0
+
+        # Convert memory to tensors
+        states = T.as_tensor(np.array(self.memory.states), dtype=T.float32, device=self.device)
+        actions = T.as_tensor(self.memory.actions, dtype=T.long, device=self.device)
+        rewards = T.as_tensor(self.memory.rewards, dtype=T.float32, device=self.device)
+        dones = T.as_tensor(self.memory.dones, dtype=T.float32, device=self.device)
+        old_logp = T.as_tensor(self.memory.log_probs, dtype=T.float32, device=self.device)
+        values = T.as_tensor(self.memory.values, dtype=T.float32, device=self.device)
+
+        with T.no_grad():
+            # Compute next values (bootstrap for final step)
+            next_values = T.cat([values[1:], values[-1:].clone()])
+            deltas = rewards + self.gamma * next_values * (1 - dones) - values
+
+            # --- GAE-Lambda ---
+            adv = T.zeros_like(rewards)
+            gae = 0.0
+            for t in reversed(range(len(rewards))):
+                gae = deltas[t] + self.gamma * self.lam * (1 - dones[t]) * gae
+                adv[t] = gae
+
+            # Advantage normalization
+            self.advantageNorm.update(adv)
+            adv = self.advantageNorm.normalize(adv)
+
+            returns = adv + values
+
+            # --- returns normalization ---
+            self.returnNorm.update(returns)
+            returns = self.returnNorm.normalize(returns)
+
+            # --- observation normalization ---
+            self.observeNorm.update(states)
+            states = self.observeNorm.normalize(states)
+
+        # --- PPO Multiple Epochs + Minibatch ---
+        total_loss_epoch = 0.0
+        num_samples = len(states)
+        batch_size = min(64, num_samples)
+        ppo_epochs = 4
+
+        for _ in range(ppo_epochs):
+            # Shuffle indices
+            idxs = T.randperm(num_samples)
+            for start in range(0, num_samples, batch_size):
+                batch_idx = idxs[start:start + batch_size]
+
+                b_states = states[batch_idx]
+                b_actions = actions[batch_idx]
+                b_old_logp = old_logp[batch_idx]
+                b_returns = returns[batch_idx]
+                b_adv = adv[batch_idx]
+
+                dist = self.policy.next_action(b_states)
+                new_logp = dist.log_prob(b_actions)
+                entropy = dist.entropy().mean()
+                ratio = (new_logp - b_old_logp).exp()
+
+                # --- Clipped surrogate objective ---
+                surr1 = ratio * b_adv
+                surr2 = T.clamp(ratio, 1 - self.clip, 1 + self.clip) * b_adv
+                policy_loss = -T.min(surr1, surr2).mean()
+
+                # --- Critic loss ---
+                value_pred = self.critic.evaluated_state(b_states)
+                value_loss = 0.5 * (b_returns - value_pred).pow(2).mean()
+
+                # --- Total loss ---
+                total_loss = (
+                    policy_loss +
+                    self.value_coef * value_loss -
+                    self.entropy_coef * entropy
+                )
+
+                # Debug: track individual loss components
+                self.policy_loss_history.append(policy_loss.item())
+                self.value_loss_history.append(value_loss.item())
+
+                self.opt.zero_grad(set_to_none=True)
+                total_loss.backward()
+                self.opt.step()
+                total_loss_epoch += total_loss.item()
+
+        # Clear memory after full PPO update
+        self.memory.clear()
+
+        return total_loss_epoch / (ppo_epochs * (num_samples / batch_size))
+
+    def update_observation_advantage_norm(self):
+        if len(self.memory.states) == 0:
+            return 0.0
+
+        # Convert memory to tensors
+        states = T.as_tensor(np.array(self.memory.states), dtype=T.float32, device=self.device)
+        actions = T.as_tensor(self.memory.actions, dtype=T.long, device=self.device)
+        rewards = T.as_tensor(self.memory.rewards, dtype=T.float32, device=self.device)
+        dones = T.as_tensor(self.memory.dones, dtype=T.float32, device=self.device)
+        old_logp = T.as_tensor(self.memory.log_probs, dtype=T.float32, device=self.device)
+        values = T.as_tensor(self.memory.values, dtype=T.float32, device=self.device)
+
+        with T.no_grad():
+            # Compute next values (bootstrap for final step)
+            next_values = T.cat([values[1:], values[-1:].clone()])
+            deltas = rewards + self.gamma * next_values * (1 - dones) - values
+
+            # --- GAE-Lambda ---
+            adv = T.zeros_like(rewards)
+            gae = 0.0
+            for t in reversed(range(len(rewards))):
+                gae = deltas[t] + self.gamma * self.lam * (1 - dones[t]) * gae
+                adv[t] = gae
+
+            # Advantage normalization
+            self.advantageNorm.update(adv)
+            adv = self.advantageNorm.normalize(adv)
+
+            returns = adv + values
+
+            # --- observation normalization ---
+            self.observeNorm.update(states)
+            states = self.observeNorm.normalize(states)
+
+        # --- PPO Multiple Epochs + Minibatch ---
+        total_loss_epoch = 0.0
+        num_samples = len(states)
+        batch_size = min(64, num_samples)
+        ppo_epochs = 4
+
+        for _ in range(ppo_epochs):
+            # Shuffle indices
+            idxs = T.randperm(num_samples)
+            for start in range(0, num_samples, batch_size):
+                batch_idx = idxs[start:start + batch_size]
+
+                b_states = states[batch_idx]
+                b_actions = actions[batch_idx]
+                b_old_logp = old_logp[batch_idx]
+                b_returns = returns[batch_idx]
+                b_adv = adv[batch_idx]
+
+                dist = self.policy.next_action(b_states)
+                new_logp = dist.log_prob(b_actions)
+                entropy = dist.entropy().mean()
+                ratio = (new_logp - b_old_logp).exp()
+
+                # --- Clipped surrogate objective ---
+                surr1 = ratio * b_adv
+                surr2 = T.clamp(ratio, 1 - self.clip, 1 + self.clip) * b_adv
+                policy_loss = -T.min(surr1, surr2).mean()
+
+                # --- Critic loss ---
+                value_pred = self.critic.evaluated_state(b_states)
+                value_loss = 0.5 * (b_returns - value_pred).pow(2).mean()
+
+                # --- Total loss ---
+                total_loss = (
+                    policy_loss +
+                    self.value_coef * value_loss -
+                    self.entropy_coef * entropy
+                )
+
+                # Debug: track individual loss components
+                self.policy_loss_history.append(policy_loss.item())
+                self.value_loss_history.append(value_loss.item())
+
+                self.opt.zero_grad(set_to_none=True)
+                total_loss.backward()
+                self.opt.step()
+                total_loss_epoch += total_loss.item()
+
+        # Clear memory after full PPO update
+        self.memory.clear()
+
+        return total_loss_epoch / (ppo_epochs * (num_samples / batch_size))
+
+    def update_observation_return_norm(self):
+        if len(self.memory.states) == 0:
+            return 0.0
+
+        # Convert memory to tensors
+        states = T.as_tensor(np.array(self.memory.states), dtype=T.float32, device=self.device)
+        actions = T.as_tensor(self.memory.actions, dtype=T.long, device=self.device)
+        rewards = T.as_tensor(self.memory.rewards, dtype=T.float32, device=self.device)
+        dones = T.as_tensor(self.memory.dones, dtype=T.float32, device=self.device)
+        old_logp = T.as_tensor(self.memory.log_probs, dtype=T.float32, device=self.device)
+        values = T.as_tensor(self.memory.values, dtype=T.float32, device=self.device)
+
+        with T.no_grad():
+            # Compute next values (bootstrap for final step)
+            next_values = T.cat([values[1:], values[-1:].clone()])
+            deltas = rewards + self.gamma * next_values * (1 - dones) - values
+
+            # --- GAE-Lambda ---
+            adv = T.zeros_like(rewards)
+            gae = 0.0
+            for t in reversed(range(len(rewards))):
+                gae = deltas[t] + self.gamma * self.lam * (1 - dones[t]) * gae
+                adv[t] = gae
+
+            returns = adv + values
+
+            # --- returns normalization ---
+            self.returnNorm.update(returns)
+            returns = self.returnNorm.normalize(returns)
+
+            # --- observation normalization ---
+            self.observeNorm.update(states)
+            states = self.observeNorm.normalize(states)
+
+        # --- PPO Multiple Epochs + Minibatch ---
+        total_loss_epoch = 0.0
+        num_samples = len(states)
+        batch_size = min(64, num_samples)
+        ppo_epochs = 4
+
+        for _ in range(ppo_epochs):
+            # Shuffle indices
+            idxs = T.randperm(num_samples)
+            for start in range(0, num_samples, batch_size):
+                batch_idx = idxs[start:start + batch_size]
+
+                b_states = states[batch_idx]
+                b_actions = actions[batch_idx]
+                b_old_logp = old_logp[batch_idx]
+                b_returns = returns[batch_idx]
+                b_adv = adv[batch_idx]
+
+                dist = self.policy.next_action(b_states)
+                new_logp = dist.log_prob(b_actions)
+                entropy = dist.entropy().mean()
+                ratio = (new_logp - b_old_logp).exp()
+
+                # --- Clipped surrogate objective ---
+                surr1 = ratio * b_adv
+                surr2 = T.clamp(ratio, 1 - self.clip, 1 + self.clip) * b_adv
+                policy_loss = -T.min(surr1, surr2).mean()
+
+                # --- Critic loss ---
+                value_pred = self.critic.evaluated_state(b_states)
+                value_loss = 0.5 * (b_returns - value_pred).pow(2).mean()
+
+                # --- Total loss ---
+                total_loss = (
+                    policy_loss +
+                    self.value_coef * value_loss -
+                    self.entropy_coef * entropy
+                )
+
+                # Debug: track individual loss components
+                self.policy_loss_history.append(policy_loss.item())
+                self.value_loss_history.append(value_loss.item())
+
+                self.opt.zero_grad(set_to_none=True)
+                total_loss.backward()
+                self.opt.step()
+                total_loss_epoch += total_loss.item()
+
+        # Clear memory after full PPO update
+        self.memory.clear()
+
+        return total_loss_epoch / (ppo_epochs * (num_samples / batch_size))
+
+    def update_advantage_return_norm(self):
+        if len(self.memory.states) == 0:
+            return 0.0
+
+        # Convert memory to tensors
+        states = T.as_tensor(np.array(self.memory.states), dtype=T.float32, device=self.device)
+        actions = T.as_tensor(self.memory.actions, dtype=T.long, device=self.device)
+        rewards = T.as_tensor(self.memory.rewards, dtype=T.float32, device=self.device)
+        dones = T.as_tensor(self.memory.dones, dtype=T.float32, device=self.device)
+        old_logp = T.as_tensor(self.memory.log_probs, dtype=T.float32, device=self.device)
+        values = T.as_tensor(self.memory.values, dtype=T.float32, device=self.device)
+
+        with T.no_grad():
+            # Compute next values (bootstrap for final step)
+            next_values = T.cat([values[1:], values[-1:].clone()])
+            deltas = rewards + self.gamma * next_values * (1 - dones) - values
+
+            # --- GAE-Lambda ---
+            adv = T.zeros_like(rewards)
+            gae = 0.0
+            for t in reversed(range(len(rewards))):
+                gae = deltas[t] + self.gamma * self.lam * (1 - dones[t]) * gae
+                adv[t] = gae
+
+            # Advantage normalization
+            self.advantageNorm.update(adv)
+            adv = self.advantageNorm.normalize(adv)
+
+            returns = adv + values
+
+            # --- returns normalization ---
+            self.returnNorm.update(returns)
+            returns = self.returnNorm.normalize(returns)
+
+        # --- PPO Multiple Epochs + Minibatch ---
+        total_loss_epoch = 0.0
+        num_samples = len(states)
+        batch_size = min(64, num_samples)
+        ppo_epochs = 4
+
+        for _ in range(ppo_epochs):
+            # Shuffle indices
+            idxs = T.randperm(num_samples)
+            for start in range(0, num_samples, batch_size):
+                batch_idx = idxs[start:start + batch_size]
+
+                b_states = states[batch_idx]
+                b_actions = actions[batch_idx]
+                b_old_logp = old_logp[batch_idx]
+                b_returns = returns[batch_idx]
+                b_adv = adv[batch_idx]
+
+                dist = self.policy.next_action(b_states)
+                new_logp = dist.log_prob(b_actions)
+                entropy = dist.entropy().mean()
+                ratio = (new_logp - b_old_logp).exp()
+
+                # --- Clipped surrogate objective ---
+                surr1 = ratio * b_adv
+                surr2 = T.clamp(ratio, 1 - self.clip, 1 + self.clip) * b_adv
+                policy_loss = -T.min(surr1, surr2).mean()
+
+                # --- Critic loss ---
+                value_pred = self.critic.evaluated_state(b_states)
+                value_loss = 0.5 * (b_returns - value_pred).pow(2).mean()
+
+                # --- Total loss ---
+                total_loss = (
+                    policy_loss +
+                    self.value_coef * value_loss -
+                    self.entropy_coef * entropy
+                )
+
+                # Debug: track individual loss components
+                self.policy_loss_history.append(policy_loss.item())
+                self.value_loss_history.append(value_loss.item())
+
+                self.opt.zero_grad(set_to_none=True)
+                total_loss.backward()
+                self.opt.step()
+                total_loss_epoch += total_loss.item()
+
+        # Clear memory after full PPO update
+        self.memory.clear()
+
+        return total_loss_epoch / (ppo_epochs * (num_samples / batch_size))
+    # ------------------------------------------ #
+
+    def update_observation_norm(self):
+        if len(self.memory.states) == 0:
+            return 0.0
+
+        # Convert memory to tensors
+        states = T.as_tensor(np.array(self.memory.states), dtype=T.float32, device=self.device)
+        actions = T.as_tensor(self.memory.actions, dtype=T.long, device=self.device)
+        rewards = T.as_tensor(self.memory.rewards, dtype=T.float32, device=self.device)
+        dones = T.as_tensor(self.memory.dones, dtype=T.float32, device=self.device)
+        old_logp = T.as_tensor(self.memory.log_probs, dtype=T.float32, device=self.device)
+        values = T.as_tensor(self.memory.values, dtype=T.float32, device=self.device)
+
+        with T.no_grad():
+            # Compute next values (bootstrap for final step)
+            next_values = T.cat([values[1:], values[-1:].clone()])
+            deltas = rewards + self.gamma * next_values * (1 - dones) - values
+
+            # --- GAE-Lambda ---
+            adv = T.zeros_like(rewards)
+            gae = 0.0
+            for t in reversed(range(len(rewards))):
+                gae = deltas[t] + self.gamma * self.lam * (1 - dones[t]) * gae
+                adv[t] = gae
+
+            returns = adv + values
+
+            # --- observation normalization ---
+            self.observeNorm.update(states)
+            states = self.observeNorm.normalize(states)
+            # Advantage normalization
+            adv = (adv - adv.mean()) / (adv.std(unbiased=False) + 1e-8)
+
+        # --- PPO Multiple Epochs + Minibatch ---
+        total_loss_epoch = 0.0
+        num_samples = len(states)
+        batch_size = min(64, num_samples)
+        ppo_epochs = 4
+
+        for _ in range(ppo_epochs):
+            # Shuffle indices
+            idxs = T.randperm(num_samples)
+            for start in range(0, num_samples, batch_size):
+                batch_idx = idxs[start:start + batch_size]
+
+                b_states = states[batch_idx]
+                b_actions = actions[batch_idx]
+                b_old_logp = old_logp[batch_idx]
+                b_returns = returns[batch_idx]
+                b_adv = adv[batch_idx]
+
+                dist = self.policy.next_action(b_states)
+                new_logp = dist.log_prob(b_actions)
+                entropy = dist.entropy().mean()
+                ratio = (new_logp - b_old_logp).exp()
+
+                # --- Clipped surrogate objective ---
+                surr1 = ratio * b_adv
+                surr2 = T.clamp(ratio, 1 - self.clip, 1 + self.clip) * b_adv
+                policy_loss = -T.min(surr1, surr2).mean()
+
+                # --- Critic loss ---
+                value_pred = self.critic.evaluated_state(b_states)
+                value_loss = 0.5 * (b_returns - value_pred).pow(2).mean()
+
+                # --- Total loss ---
+                total_loss = (
+                    policy_loss +
+                    self.value_coef * value_loss -
+                    self.entropy_coef * entropy
+                )
+
+                # Debug: track individual loss components
+                self.policy_loss_history.append(policy_loss.item())
+                self.value_loss_history.append(value_loss.item())
+
+                self.opt.zero_grad(set_to_none=True)
+                total_loss.backward()
+                self.opt.step()
+                total_loss_epoch += total_loss.item()
+
+        # Clear memory after full PPO update
+        self.memory.clear()
+
+        return total_loss_epoch / (ppo_epochs * (num_samples / batch_size))
+
+    def update_advantage_norm(self):
+        if len(self.memory.states) == 0:
+            return 0.0
+
+        # Convert memory to tensors
+        states = T.as_tensor(np.array(self.memory.states), dtype=T.float32, device=self.device)
+        actions = T.as_tensor(self.memory.actions, dtype=T.long, device=self.device)
+        rewards = T.as_tensor(self.memory.rewards, dtype=T.float32, device=self.device)
+        dones = T.as_tensor(self.memory.dones, dtype=T.float32, device=self.device)
+        old_logp = T.as_tensor(self.memory.log_probs, dtype=T.float32, device=self.device)
+        values = T.as_tensor(self.memory.values, dtype=T.float32, device=self.device)
+
+        with T.no_grad():
+            # Compute next values (bootstrap for final step)
+            next_values = T.cat([values[1:], values[-1:].clone()])
+            deltas = rewards + self.gamma * next_values * (1 - dones) - values
+
+            # --- GAE-Lambda ---
+            adv = T.zeros_like(rewards)
+            gae = 0.0
+            for t in reversed(range(len(rewards))):
+                gae = deltas[t] + self.gamma * self.lam * (1 - dones[t]) * gae
+                adv[t] = gae
+
+            # --- Advantage normalization ---
+            self.advantageNorm.update(adv)
+            adv = self.advantageNorm.normalize(adv)
+
+            returns = adv + values
+
+        # --- PPO Multiple Epochs + Minibatch ---
+        total_loss_epoch = 0.0
+        num_samples = len(states)
+        batch_size = min(64, num_samples)
+        ppo_epochs = 4
+
+        for _ in range(ppo_epochs):
+            # Shuffle indices
+            idxs = T.randperm(num_samples)
+            for start in range(0, num_samples, batch_size):
+                batch_idx = idxs[start:start + batch_size]
+
+                b_states = states[batch_idx]
+                b_actions = actions[batch_idx]
+                b_old_logp = old_logp[batch_idx]
+                b_returns = returns[batch_idx]
+                b_adv = adv[batch_idx]
+
+                dist = self.policy.next_action(b_states)
+                new_logp = dist.log_prob(b_actions)
+                entropy = dist.entropy().mean()
+                ratio = (new_logp - b_old_logp).exp()
+
+                # --- Clipped surrogate objective ---
+                surr1 = ratio * b_adv
+                surr2 = T.clamp(ratio, 1 - self.clip, 1 + self.clip) * b_adv
+                policy_loss = -T.min(surr1, surr2).mean()
+
+                # --- Critic loss ---
+                value_pred = self.critic.evaluated_state(b_states)
+                value_loss = 0.5 * (b_returns - value_pred).pow(2).mean()
+
+                # --- Total loss ---
+                total_loss = (
+                    policy_loss +
+                    self.value_coef * value_loss -
+                    self.entropy_coef * entropy
+                )
+
+                # Debug: track individual loss components
+                self.policy_loss_history.append(policy_loss.item())
+                self.value_loss_history.append(value_loss.item())
+
+                self.opt.zero_grad(set_to_none=True)
+                total_loss.backward()
+                self.opt.step()
+                total_loss_epoch += total_loss.item()
+
+        # Clear memory after full PPO update
+        self.memory.clear()
+
+        return total_loss_epoch / (ppo_epochs * (num_samples / batch_size))
+
+    def update_return_norm(self):
+        if len(self.memory.states) == 0:
+            return 0.0
+
+        # Convert memory to tensors
+        states = T.as_tensor(np.array(self.memory.states), dtype=T.float32, device=self.device)
+        actions = T.as_tensor(self.memory.actions, dtype=T.long, device=self.device)
+        rewards = T.as_tensor(self.memory.rewards, dtype=T.float32, device=self.device)
+        dones = T.as_tensor(self.memory.dones, dtype=T.float32, device=self.device)
+        old_logp = T.as_tensor(self.memory.log_probs, dtype=T.float32, device=self.device)
+        values = T.as_tensor(self.memory.values, dtype=T.float32, device=self.device)
+
+        with T.no_grad():
+            # Compute next values (bootstrap for final step)
+            next_values = T.cat([values[1:], values[-1:].clone()])
+            deltas = rewards + self.gamma * next_values * (1 - dones) - values
+
+            # --- GAE-Lambda ---
+            adv = T.zeros_like(rewards)
+            gae = 0.0
+            for t in reversed(range(len(rewards))):
+                gae = deltas[t] + self.gamma * self.lam * (1 - dones[t]) * gae
+                adv[t] = gae
+
+            returns = adv + values
+
+            # --- returns normalization ---
+            self.returnNorm.update(returns)
+            returns = self.returnNorm.normalize(returns)
+
+            # Advantage normalization
+            adv = (adv - adv.mean()) / (adv.std(unbiased=False) + 1e-8)
+
+        # --- PPO Multiple Epochs + Minibatch ---
+        total_loss_epoch = 0.0
+        num_samples = len(states)
+        batch_size = min(64, num_samples)
+        ppo_epochs = 4
+
+        for _ in range(ppo_epochs):
+            # Shuffle indices
+            idxs = T.randperm(num_samples)
+            for start in range(0, num_samples, batch_size):
+                batch_idx = idxs[start:start + batch_size]
+
+                b_states = states[batch_idx]
+                b_actions = actions[batch_idx]
+                b_old_logp = old_logp[batch_idx]
+                b_returns = returns[batch_idx]
+                b_adv = adv[batch_idx]
+
+                dist = self.policy.next_action(b_states)
+                new_logp = dist.log_prob(b_actions)
+                entropy = dist.entropy().mean()
+                ratio = (new_logp - b_old_logp).exp()
+
+                # --- Clipped surrogate objective ---
+                surr1 = ratio * b_adv
+                surr2 = T.clamp(ratio, 1 - self.clip, 1 + self.clip) * b_adv
+                policy_loss = -T.min(surr1, surr2).mean()
+
+                # --- Critic loss ---
+                value_pred = self.critic.evaluated_state(b_states)
+                value_loss = 0.5 * (b_returns - value_pred).pow(2).mean()
+
+                # --- Total loss ---
+                total_loss = (
+                    policy_loss +
+                    self.value_coef * value_loss -
+                    self.entropy_coef * entropy
+                )
+
+                # Debug: track individual loss components
+                self.policy_loss_history.append(policy_loss.item())
+                self.value_loss_history.append(value_loss.item())
+
+                self.opt.zero_grad(set_to_none=True)
+                total_loss.backward()
+                self.opt.step()
+                total_loss_epoch += total_loss.item()
+
+        # Clear memory after full PPO update
+        self.memory.clear()
+
+        return total_loss_epoch / (ppo_epochs * (num_samples / batch_size))
+
+    def update_reward_gradient_clipping(self):
+        if len(self.memory.states) == 0:
+            return 0.0
+
+        # Convert memory to tensors
+        states = T.as_tensor(np.array(self.memory.states), dtype=T.float32, device=self.device)
+        actions = T.as_tensor(self.memory.actions, dtype=T.long, device=self.device)
+        rewards = T.as_tensor(self.memory.rewards, dtype=T.float32, device=self.device)
+        dones = T.as_tensor(self.memory.dones, dtype=T.float32, device=self.device)
+        old_logp = T.as_tensor(self.memory.log_probs, dtype=T.float32, device=self.device)
+        values = T.as_tensor(self.memory.values, dtype=T.float32, device=self.device)
+
+        # Reward clipping
+        rewards = T.clamp(rewards, -1, 1)
+
+        with T.no_grad():
+            # Compute next values (bootstrap for final step)
+            next_values = T.cat([values[1:], values[-1:].clone()])
+            deltas = rewards + self.gamma * next_values * (1 - dones) - values
+
+            # --- GAE-Lambda ---
+            adv = T.zeros_like(rewards)
+            gae = 0.0
+            for t in reversed(range(len(rewards))):
+                gae = deltas[t] + self.gamma * self.lam * (1 - dones[t]) * gae
+                adv[t] = gae
+
+            returns = adv + values
+            # Advantage normalization
+            adv = (adv - adv.mean()) / (adv.std(unbiased=False) + 1e-8)
+
+        # --- PPO Multiple Epochs + Minibatch ---
+        total_loss_epoch = 0.0
+        num_samples = len(states)
+        batch_size = min(64, num_samples)
+        ppo_epochs = 4
+
+        for _ in range(ppo_epochs):
+            # Shuffle indices
+            idxs = T.randperm(num_samples)
+            for start in range(0, num_samples, batch_size):
+                batch_idx = idxs[start:start + batch_size]
+
+                b_states = states[batch_idx]
+                b_actions = actions[batch_idx]
+                b_old_logp = old_logp[batch_idx]
+                b_returns = returns[batch_idx]
+                b_adv = adv[batch_idx]
+
+                dist = self.policy.next_action(b_states)
+                new_logp = dist.log_prob(b_actions)
+                entropy = dist.entropy().mean()
+                ratio = (new_logp - b_old_logp).exp()
+
+                # --- Clipped surrogate objective ---
+                surr1 = ratio * b_adv
+                surr2 = T.clamp(ratio, 1 - self.clip, 1 + self.clip) * b_adv
+                policy_loss = -T.min(surr1, surr2).mean()
+
+                # --- Critic loss ---
+                value_pred = self.critic.evaluated_state(b_states)
+                value_loss = 0.5 * (b_returns - value_pred).pow(2).mean()
+
+                # --- Total loss ---
+                total_loss = (
+                    policy_loss +
+                    self.value_coef * value_loss -
+                    self.entropy_coef * entropy
+                )
+
+                # Debug: track individual loss components
+                self.policy_loss_history.append(policy_loss.item())
+                self.value_loss_history.append(value_loss.item())
+
+                self.opt.zero_grad(set_to_none=True)
+                total_loss.backward()
+                T.nn.utils.clip_grad_norm_(list(self.policy.parameters()) + list(self.critic.parameters()), 0.5)
+                self.opt.step()
+
+                total_loss_epoch += total_loss.item()
+
+        # Clear memory after full PPO update
+        self.memory.clear()
+
+        return total_loss_epoch / (ppo_epochs * (num_samples / batch_size))
+
+"""
+# Policy network (simple MLP, flattened observations)
+class Policy(nn.Module):
+    def __init__(self, obs_dim: int, action_dim: int, hidden: int):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(obs_dim, hidden),
+            nn.ReLU(),
+            nn.Linear(hidden, hidden),
+            nn.ReLU(),
+            nn.Linear(hidden, action_dim)
+        )
+
+    def next_action(self, state: T.Tensor) -> Categorical:
+        # Returns the probability distribution over actions
+        if state.dim() == 1:
+            state = state.unsqueeze(0)
+        state = state.view(state.size(0), -1)
+        return Categorical(logits=self.net(state))
+"""
+
+# Policy network (CNN)
+class Policy(nn.Module):
+    def __init__(self, obs_shape: tuple, action_dim: int, hidden: int):
+        super().__init__()
+        c, h, w = obs_shape
+        # Suggested architecture for Atari: https://arxiv.org/pdf/1312.5602
+        self.cnn = nn.Sequential(
+            nn.Conv2d(c, 16, kernel_size=8, stride=4),
+            nn.ReLU(),
+            nn.Conv2d(16, 32, kernel_size=4, stride=2),
+            nn.ReLU(),
+            nn.Flatten()
+        )
+
+        with T.no_grad():
+            cnn_output_dim = self.cnn(T.zeros(1, c, h, w)).shape[1]
+
+        self.net = nn.Sequential(
+            nn.Linear(cnn_output_dim, hidden),
+            nn.ReLU(),
+            nn.Linear(hidden, action_dim)
+        )
+
+    def next_action(self, state: T.Tensor) -> Categorical:
+        # Returns the probability distribution over actions
+        if state.dim() == 3:
+            state = state.unsqueeze(0)
+        cnn_out = self.cnn(state)
+        return Categorical(logits=self.net(cnn_out))
+
+"""
+# Critic network (simple MLP, flattened observations)
+class Critic(nn.Module):
+    def __init__(self, obs_dim: int, hidden: int):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(obs_dim, hidden),
+            nn.ReLU(),
+            nn.Linear(hidden, hidden),
+            nn.ReLU(),
+            nn.Linear(hidden, 1)
+        )
+
+    def evaluated_state(self, x: T.Tensor) -> T.Tensor:
+        if x.dim() == 1:
+            x = x.unsqueeze(0)
+        x = x.view(x.size(0), -1)
+        return self.net(x).squeeze(-1)
+"""
+
+# Critic network (CNN)
+class Critic(nn.Module):
+    def __init__(self, obs_shape: tuple, hidden: int):
+        super().__init__()
+        c, h, w = obs_shape
+        # Suggested architecture for Atari: https://arxiv.org/pdf/1312.5602
+        self.cnn = nn.Sequential(
+            nn.Conv2d(c, 16, kernel_size=8, stride=4),
+            nn.ReLU(),
+            nn.Conv2d(16, 32, kernel_size=4, stride=2),
+            nn.ReLU(),
+            nn.Flatten()
+        )
+
+        with T.no_grad():
+            cnn_output_dim = self.cnn(T.zeros(1, c, h, w)).shape[1]
+
+        self.net = nn.Sequential(
+            nn.Linear(cnn_output_dim, hidden),
+            nn.ReLU(),
+            nn.Linear(hidden, 1)
+        )
+
+    def evaluated_state(self, x: T.Tensor) -> T.Tensor:
+        if x.dim() == 3:
+            x = x.unsqueeze(0)
+        cnn_out = self.cnn(x)
+        return self.net(cnn_out).squeeze(-1)
+
+class Memory():
+    def __init__(self):
+        self.states = []
+        self.actions = []
+        self.rewards = []
+        self.dones = []
+        self.log_probs = []
+        self.values = []
+        self.next_values = []
+
+    def store(self, state, action, reward, done, log_prob, value, next_value):
+        self.states.append(np.asarray(state, dtype=np.float32))
+        self.actions.append(int(action))
+        self.rewards.append(float(reward))
+        self.dones.append(float(done))
+        self.log_probs.append(float(log_prob))
+        self.values.append(float(value))
+        self.next_values.append(float(next_value))
+
+    """
+    # For mini-batch updates? To be implemented
+    def start_batch(self, batch_size: int):
+        n_states = len(self.states)
+        starts = np.arange(0, n_states, batch_size)
+        index = np.arange(n_states, dtype=np.int64)
+        np.random.shuffle(index)
+        return [index[s:s + batch_size] for s in starts]
+    """
+
+    def clear(self):
+        self.states = []
+        self.actions = []
+        self.rewards = []
+        self.dones = []
+        self.log_probs = []
+        self.values = []
+        self.next_values = []
+
+
+class ObservationNorm:
+    def __init__(self):
+        self.main_mean = 0
+        self.main_var = 0
+        self.count = 1e-4
+
+    def update(self, x: T.Tensor):
+        batch_mean = T.mean(x, dim=0)
+        batch_var = T.var(x, dim=0)
+        batch_count = x.shape[0]
1160
+ self._update_from_moments(batch_mean, batch_var, batch_count)
1161
+
1162
+ def _update_from_moments(self, batch_mean, batch_var, batch_count):
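+ # Parallel/Welford-style merge: fold the batch (mean, var, count) into the running
+ # statistics so they reflect every sample seen so far.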
1163
+ delta = batch_mean - self.main_mean
1164
+ tot_count = self.count + batch_count
1165
+ new_mean = self.main_mean + delta * batch_count / tot_count #Update the running mean
1166
+ m_a = self.main_var * self.count
1167
+ m_b = batch_var * batch_count
1168
+ M2 = m_a + m_b + delta.pow(2) * self.count * batch_count / tot_count  # torch op: np.square fails on CUDA tensors
1169
+ new_var = M2 / tot_count # update the running variance
1170
+
1171
+ self.main_mean, self.main_var, self.count = new_mean, new_var, tot_count
1172
+
1173
+ def normalize(self, x):
1174
+
1175
+ return (x - self.main_mean) / (T.sqrt(self.main_var) + 1e-8) # We add epsilon to make sure that we don't
1176
+ # divide by zero.
1177
+
1178
+
1179
+
1180
+
1181
+
1182
+ class AdvantageNorm:
1183
+ '''
1184
+ This class implements the Advantage Normalization. The purpose is to normalize either across batches or
1185
+ only within the same batch.
1186
+
1187
+ '''
1188
+ def __init__(self):
1189
+ self.main_mean = 0
1190
+ self.main_var = 0
1191
+ self.count = 1e-4
1192
+
1193
+ def update(self, x: T.Tensor):
1194
+ batch_mean = T.mean(x, dim=0)
1195
+ batch_var = T.var(x, dim=0)
1196
+ batch_count = x.shape[0]
1197
+ self._update_from_moments(batch_mean, batch_var, batch_count)
1198
+
1199
+ def _update_from_moments(self, batch_mean, batch_var, batch_count):
1200
+ delta = batch_mean - self.main_mean
1201
+ tot_count = self.count + batch_count
1202
+ new_mean = self.main_mean + delta * batch_count / tot_count #Update the running mean
1203
+ m_a = self.main_var * self.count
1204
+ m_b = batch_var * batch_count
1205
+ M2 = m_a + m_b + delta.pow(2) * self.count * batch_count / tot_count
1206
+ new_var = M2 / tot_count # update the running variance
1207
+
1208
+ self.main_mean, self.main_var, self.count = new_mean, new_var, tot_count
1209
+
1210
+ def normalize(self, x):
1211
+
1212
+ return (x - self.main_mean) / (T.sqrt(self.main_var) + 1e-8) # We add epsilon to make sure that we don't
1212
+ # divide by zero.
1214
+
1215
+
1216
+
1217
+
1218
+ class ReturnNorm:
1219
+ '''
1220
+ This class implements Return Normalization. The purpose is to normalize either across batches or
1221
+ only within the same batch.
1222
+
1223
+ '''
1224
+ def __init__(self):
1225
+ self.main_mean = 0
1226
+ self.main_var = 0
1227
+ self.count = 1e-4
1228
+
1229
+ def update(self, x: T.Tensor):
1230
+ batch_mean = T.mean(x, dim=0)
1231
+ batch_var = T.var(x, dim=0)
1232
+ batch_count = x.shape[0]
1233
+ self._update_from_moments(batch_mean, batch_var, batch_count)
1234
+
1235
+ def _update_from_moments(self, batch_mean, batch_var, batch_count):
1236
+ delta = batch_mean - self.main_mean
1237
+ tot_count = self.count + batch_count
1238
+ new_mean = self.main_mean + delta * batch_count / tot_count #Update the running mean
1239
+ m_a = self.main_var * self.count
1240
+ m_b = batch_var * batch_count
1241
+ M2 = m_a + m_b + delta.pow(2) * self.count * batch_count / tot_count
1242
+ new_var = M2 / tot_count # update the running variance
1243
+
1244
+ self.main_mean, self.main_var, self.count = new_mean, new_var, tot_count
1245
+
1246
+ def normalize(self, x):
1247
+
1248
+ return (x - self.main_mean) / (T.sqrt(self.main_var) + 1e-8) # We add epsilon to make sure that we don't
1248
+ # divide by zero.
1250
+
1251
+
1252
+
1253
+
1254
+
Observation_Advantage_Norm_diff_combo/ppo_rew_norm_obs_env_diff_combo.py ADDED
@@ -0,0 +1,201 @@
1
+
2
+ import gymnasium as gym
3
+ import sys
4
+ import matplotlib.pyplot as plt
5
+ import ale_py
6
+ from ppo__rew_norm_obs_diff_combo import *
7
+ from gymnasium.spaces import Box
8
+ import cv2
9
+
10
+
11
+ class PlotCreater:
12
+ def __init__(self):
13
+ self.fig = plt.figure(figsize=(12, 8))
14
+ self.ax2 = plt.subplot(221)
15
+ self.ax3 = plt.subplot(222)
16
+ self.ax4 = plt.subplot(223)
17
+ self.ax5 = plt.subplot(224)
18
+
19
+ """
20
+ # Plot for Return-Based Scaling only
21
+ ax1 = plt.subplot(220)
22
+ ax1.plot(agent.sigma_history, label="Return σ")
23
+ ax1.set_xlabel("PPO Update")
24
+ ax1.set_ylabel("σ (Return Std)")
25
+ """
26
+
27
+
28
+
29
+ def lossHistorySetting(self, loss_history, update_type):
30
+ self.ax2.plot(loss_history, label=update_type)
31
+
32
+
33
+ def rewardSetting(self, reward_history, update_type):
34
+ self.ax3.plot(reward_history, label=update_type)
35
+
36
+ def policyHistorySetting(self, policy_history, update_type):
37
+ self.ax4.plot(policy_history, label=update_type)
38
+
39
+ def valueLossSetting(self, value_loss_history, update_type):
40
+ self.ax5.plot(value_loss_history, label=update_type)
41
+
42
+
43
+
44
+
45
+ def setTitle(self, title):
46
+ self.fig.suptitle(title)
47
+
48
+
49
+ def plotShow(self):
50
+
51
+ self.ax2.set_ylabel("Average PPO Loss")
52
+ self.ax2.set_xlabel("PPO Update")
53
+ self.ax2.legend()
54
+
55
+ self.ax3.set_ylabel("Reward")
56
+ self.ax3.set_xlabel("PPO Update")
57
+ self.ax3.legend()
58
+
59
+ # Details about value loss and policy loss
60
+
61
+ self.ax4.set_ylabel("Policy Loss")
62
+ self.ax4.set_xlabel("Training Step")
63
+ self.ax4.legend()
64
+
65
+ self.ax5.set_ylabel("Value Loss")
66
+ self.ax5.set_xlabel("Training Step")
67
+ self.ax5.legend()
68
+
69
+ self.fig.suptitle("PPO Training Stability of type " +
70
+ "-running_average")
71
+ self.fig.tight_layout()
72
+ self.fig.savefig("Different_combination_running_average_.png")
73
+ plt.show()
74
+ print("Show the graph and store them")
75
+
76
+
77
+ def preprocess(obs):
78
+ # Convert to grayscale
79
+ obs = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY)
80
+ # Resize
81
+ obs = cv2.resize(obs, (84, 84), interpolation=cv2.INTER_AREA)
82
+ # Add channel dimension and normalize
83
+ return np.expand_dims(obs, axis=0).astype(np.float32) / 255.0
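+ # preprocess() yields a (1, 84, 84) float32 array in [0, 1], matching the (c, h, w) layout the CNN expects.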
84
+
85
+
86
+ def rl_model(update_type, plotCreater):
87
+ # env = gym.make("ALE/SpaceInvaders-v5", render_mode='human')
88
+ # env = gym.make("ALE/Pacman-v5", render_mode="human")
89
+ env = gym.make("ALE/Pacman-v5")
90
+
91
+ episode = 0
92
+ total_return = 0
93
+ ep_return = 0
94
+ steps = 1000
95
+ batches = 100
96
+
97
+ print("Observation space:", env.observation_space)
98
+ print("Action space:", env.action_space)
99
+ """
100
+ agent = Agent(obs_space=env.observation_space, action_space=env.action_space,
101
+ hidden=64, lr=3e-4, gamma=0.99, clip_coef=0.2,
102
+ entropy_coef=0.01, value_coef=0.5, seed=70,
103
+ batch_size = 64, ppo_epochs = 4, lam = 0.95)
104
+
105
+ """
106
+ # Initialize CNN with a dummy observation (to get correct input shape)
107
+ obs, _ = env.reset()
108
+ dummy_obs_space = Box(low=0.0, high=1.0, shape=preprocess(obs).shape)
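+ # The Box only carries the preprocessed observation shape through to the Agent's CNN.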
109
+
110
+ agent = Agent(obs_space=dummy_obs_space, action_space=env.action_space,
111
+ hidden=64, lr=3e-4, gamma=0.99, clip_coef=0.2,
112
+ entropy_coef=0.01, value_coef=0.5, seed=70,
113
+ batch_size=64, ppo_epochs=4, lam=0.95, update_type=update_type)
114
+ """
115
+ # Stats for Return-Based Scaling only
116
+ # === Return-Based Scaling stats ===
117
+ r_mean, r_var = 0.0, 1e-8
118
+ g2_mean = 1.0
119
+
120
+ agent.r_var = r_var
121
+ agent.g2_mean = g2_mean
122
+ """
123
+
124
+ try:
125
+ obs, info = env.reset(seed=42)
126
+ state = preprocess(obs)
127
+
128
+ loss_history = []
129
+ reward_history = []
130
+
131
+ for update in range(1, batches + 1):
132
+ for t in range(steps):
133
+ action, logp, value = agent.choose_action(state)
134
+ next_obs, reward, terminated, truncated, info = env.step(action)
135
+ done = terminated or truncated
136
+ next_state = preprocess(next_obs)
137
+
138
+ agent.remember(state, action, reward, done, logp, value, next_state)
139
+
140
+ ep_return += reward
141
+ state = next_state
142
+
143
+ if done:
144
+ episode += 1
145
+ total_return += ep_return
146
+ print(f"Episode {episode} return: {ep_return:.2f}")
147
+ ep_return = 0
148
+ obs, info = env.reset()
149
+ state = preprocess(obs)
150
+
151
+ # Using reward gradient clipping
152
+ avg_loss = agent._update()
153
+
154
+ # Vanilla PPO (no normalization)
155
+ # avg_loss = agent.vanilla_ppo_update()
156
+ loss_history.append(avg_loss)
157
+
158
+ avg_ret = (total_return / episode) if episode else 0
159
+ reward_history.append(avg_ret)
160
+ print(f"Update {update}: episodes={episode}, avg_return={avg_ret:.2f}, avg_loss={avg_loss:.4f}")
161
+
162
+ plotCreater.lossHistorySetting(loss_history, update_type)
163
+ plotCreater.rewardSetting(reward_history, update_type)
164
+ plotCreater.policyHistorySetting(agent.policy_loss_history, update_type)
165
+ plotCreater.valueLossSetting(agent.value_loss_history, update_type)
166
+
167
+
168
+
169
+
170
+ except Exception as e:
171
+ print(f"Error: {e}", file=sys.stderr)
172
+ return 1
173
+ finally:
174
+ avg = total_return / episode if episode else 0
175
+ print(f"\nEpisodes: {episode}, Avg return: {avg:.3f}")
176
+ env.close()
177
+
178
+ return 0
179
+
180
+
181
+
182
+
183
+
184
+
185
+
186
+
187
+ def main() -> int:
188
+ combo_type_list = ["update_all_norm", "update_observation_advantage_norm"
189
+ , "update_observation_return_norm", "update_advantage_return_norm"]
190
+ type_list = ["update_observation_norm", "update_advantage_norm", "update_return_norm", "vanilla_ppo_update"]
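+ # type_list (the single-normalization variants) is kept for reference; only combo_type_list is run below.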
191
+
192
+ plotCreater = PlotCreater()
193
+ for update_type in combo_type_list:
194
+ rl_model(update_type, plotCreater)
195
+
196
+ plotCreater.plotShow()
197
+ return 0
198
+
199
+
200
+ if __name__ == "__main__":
201
+ raise SystemExit(main())
Observation_Advantage_Norm_diff_env/ppo__rew_norm_obs_diff_env.py ADDED
@@ -0,0 +1,891 @@
1
+ import numpy as np
2
+ import torch as T
3
+ import torch.nn as nn
4
+ import torch.optim as optim
5
+ from torch.distributions import Categorical
6
+
7
+
8
+ class Agent:
9
+ def __init__(
10
+ self,
11
+ obs_space,
12
+ action_space,
13
+ hidden,
14
+ gamma,
15
+ clip_coef,
16
+ lr,
17
+ value_coef,
18
+ entropy_coef,
19
+ seed,
20
+ batch_size,
21
+ ppo_epochs,
22
+ lam,
23
+ update_type
24
+
25
+ ):
26
+ # Initialize seed for reproducibility
27
+ if seed is not None:
28
+ np.random.seed(seed)
29
+ T.manual_seed(seed)
30
+ """
31
+ # For flat observations (MLP model)
32
+ # Use GPU if available
33
+ self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
34
+ self.obs_dim = int(np.prod(getattr(obs_space, "shape", (obs_space,))))
35
+ self.action_dim = int(getattr(action_space, "n", action_space))
36
+
37
+ # Initialize the policy and the critic networks
38
+ self.policy = Policy(self.obs_dim, self.action_dim, hidden).to(self.device)
39
+ self.critic = Critic(self.obs_dim, hidden).to(self.device)
40
+ """
41
+ # Use GPU if available
42
+ self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
43
+ self.action_dim = int(getattr(action_space, "n", action_space))
44
+ self.update_type = update_type
45
+
46
+ # Initialize the policy and the critic networks
47
+ # Pass the shape tuple directly, not the flattened dimension.
48
+ self.policy = Policy(obs_space.shape, self.action_dim, hidden).to(self.device)
49
+ self.critic = Critic(obs_space.shape, hidden).to(self.device)
50
+ self.observeNorm = ObservationNorm()
51
+ self.advantageNorm = AdvantageNorm()
52
+ self.returnNorm = ReturnNorm()
53
+
54
+ # Set optimizer for policy and critic networks
55
+ self.opt = optim.Adam(
56
+ list(self.policy.parameters()) + list(self.critic.parameters()),
57
+ lr=lr
58
+ )
59
+
60
+ self.gamma = gamma
61
+ self.clip = clip_coef
62
+ self.value_coef = value_coef
63
+ self.entropy_coef = entropy_coef
64
+ self.sigma_history = []
65
+ self.loss_history = []
66
+ self.policy_loss_history = []
67
+ self.value_loss_history = []
68
+ self.entropy_history = []
69
+ self.lam = lam
70
+ self.ppo_epochs = ppo_epochs
71
+ self.batch_size = batch_size
72
+
73
+ self.memory = Memory()
74
+ """
75
+ # Choose action and remember for flat observations (MLP model)
76
+ def choose_action(self, observation):
77
+ # Returns: action, log probability, value of the state
78
+ state = T.as_tensor(observation, dtype=T.float32, device=self.device).view(-1)
79
+ with T.no_grad():
80
+ # Forward function (defined in Policy class)
81
+ dist = self.policy.next_action(state)
82
+ action = dist.sample()
83
+ logp = dist.log_prob(action)
84
+ value = self.critic.evaluated_state(state)
85
+ return int(action.item()), float(logp.item()), float(value.item())
86
+
87
+ def remember(self, state, action, reward, done, log_prob, value, next_state):
88
+ with T.no_grad():
89
+ # Pass on next state and have it evaluated by the critic network
90
+ ns = T.as_tensor(next_state, dtype=T.float32, device=self.device).view(-1)
91
+ next_value = self.critic.evaluated_state(ns).item()
92
+ self.memory.store(state, action, reward, done, log_prob, value, next_value)
93
+ """
94
+ # For CNN model
95
+ def choose_action(self, observation):
96
+ # Returns: action, log probability, value of the state
97
+ state = T.as_tensor(observation, dtype=T.float32, device=self.device) # Remove .view(-1)
98
+ with T.no_grad():
99
+ # Forward function (defined in Policy class)
100
+ dist = self.policy.next_action(state)
101
+ action = dist.sample()
102
+ logp = dist.log_prob(action)
103
+ value = self.critic.evaluated_state(state)
104
+ return int(action.item()), float(logp.item()), float(value.item())
105
+
106
+ def remember(self, state, action, reward, done, log_prob, value, next_state):
107
+ with T.no_grad():
108
+ # Pass on next state and have it evaluated by the critic network
109
+ ns = T.as_tensor(next_state, dtype=T.float32, device=self.device) # Remove .view(-1)
110
+ next_value = self.critic.evaluated_state(ns).item()
111
+ self.memory.store(state, action, reward, done, log_prob, value, next_value)
112
+
113
+
114
+ def _update(self):
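+ # Dispatch to the normalization variant selected via update_type;
+ # any unrecognised value falls back to vanilla PPO.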
115
+ if self.update_type == "update_observation_norm":
116
+ return self.update_observation_norm()
117
+ elif self.update_type == "update_advantage_norm":
118
+ return self.update_advantage_norm()
119
+ elif self.update_type == "update_return_norm":
120
+ return self.update_return_norm()
121
+ else:
122
+ return self.vanilla_ppo_update()
123
+
124
+ def vanilla_ppo_update(self):
125
+ if len(self.memory.states) == 0:
126
+ return 0.0
127
+
128
+ # Convert memory to tensors
129
+ states = T.as_tensor(np.array(self.memory.states), dtype=T.float32, device=self.device)
130
+ actions = T.as_tensor(self.memory.actions, dtype=T.long, device=self.device)
131
+ rewards = T.as_tensor(self.memory.rewards, dtype=T.float32, device=self.device)
132
+ dones = T.as_tensor(self.memory.dones, dtype=T.float32, device=self.device)
133
+ old_logp = T.as_tensor(self.memory.log_probs, dtype=T.float32, device=self.device)
134
+ values = T.as_tensor(self.memory.values, dtype=T.float32, device=self.device)
135
+
136
+ with T.no_grad():
137
+ # Compute next values (bootstrap for final step)
138
+ next_values = T.cat([values[1:], values[-1:].clone()])
139
+ deltas = rewards + self.gamma * next_values * (1 - dones) - values
140
+
141
+ # --- GAE-Lambda ---
142
+ adv = T.zeros_like(rewards)
143
+ gae = 0.0
144
+ for t in reversed(range(len(rewards))):
145
+ gae = deltas[t] + self.gamma * self.lam * (1 - dones[t]) * gae
146
+ adv[t] = gae
147
+
148
+ returns = adv + values
149
+ # Advantage normalization
150
+ adv = (adv - adv.mean()) / (adv.std(unbiased=False) + 1e-8)
151
+
152
+ # --- PPO Multiple Epochs + Minibatch ---
153
+ total_loss_epoch = 0.0
154
+ num_samples = len(states)
155
+ batch_size = min(64, num_samples)
156
+ ppo_epochs = 4
157
+
158
+ for _ in range(ppo_epochs):
159
+ # Shuffle indices
160
+ idxs = T.randperm(num_samples)
161
+ for start in range(0, num_samples, batch_size):
162
+ batch_idx = idxs[start:start + batch_size]
163
+
164
+ b_states = states[batch_idx]
165
+ b_actions = actions[batch_idx]
166
+ b_old_logp = old_logp[batch_idx]
167
+ b_returns = returns[batch_idx]
168
+ b_adv = adv[batch_idx]
169
+
170
+ dist = self.policy.next_action(b_states)
171
+ new_logp = dist.log_prob(b_actions)
172
+ entropy = dist.entropy().mean()
173
+ ratio = (new_logp - b_old_logp).exp()
174
+
175
+ # --- Clipped surrogate objective ---
176
+ surr1 = ratio * b_adv
177
+ surr2 = T.clamp(ratio, 1 - self.clip, 1 + self.clip) * b_adv
178
+ policy_loss = -T.min(surr1, surr2).mean()
179
+
180
+ # --- Critic loss ---
181
+ value_pred = self.critic.evaluated_state(b_states)
182
+ value_loss = 0.5 * (b_returns - value_pred).pow(2).mean()
183
+
184
+ # --- Total loss ---
185
+ total_loss = (
186
+ policy_loss +
187
+ self.value_coef * value_loss -
188
+ self.entropy_coef * entropy
189
+ )
190
+
191
+ # Debug: track individual loss components
192
+ self.policy_loss_history.append(policy_loss.item())
193
+ self.value_loss_history.append(value_loss.item())
194
+
195
+ self.opt.zero_grad(set_to_none=True)
196
+ total_loss.backward()
197
+ self.opt.step()
198
+
199
+ total_loss_epoch += total_loss.item()
200
+
201
+ # Clear memory after full PPO update
202
+ self.memory.clear()
203
+
204
+ return total_loss_epoch / (ppo_epochs * (num_samples / batch_size))
205
+
206
+
207
+ def update_rbs(self):
208
+ if len(self.memory.states) == 0:
209
+ return 0.0
210
+
211
+ # Convert memory to tensors
212
+ states = T.as_tensor(np.array(self.memory.states), dtype=T.float32, device=self.device)
213
+ actions = T.as_tensor(self.memory.actions, dtype=T.long, device=self.device)
214
+ rewards = T.as_tensor(self.memory.rewards, dtype=T.float32, device=self.device)
215
+ dones = T.as_tensor(self.memory.dones, dtype=T.float32, device=self.device)
216
+ old_logp = T.as_tensor(self.memory.log_probs, dtype=T.float32, device=self.device)
217
+ values = T.as_tensor(self.memory.values, dtype=T.float32, device=self.device)
218
+
219
+ with T.no_grad():
220
+ # Compute next values (bootstrap for final step)
221
+ next_values = T.cat([values[1:], values[-1:].clone()])
222
+ deltas = rewards + self.gamma * next_values * (1 - dones) - values
223
+
224
+ # --- GAE-Lambda ---
225
+ adv = T.zeros_like(rewards)
226
+ gae = 0.0
227
+ for t in reversed(range(len(rewards))):
228
+ gae = deltas[t] + self.gamma * self.lam * (1 - dones[t]) * gae
229
+ adv[t] = gae
230
+
231
+ returns = adv + values
232
+
233
+ # --- Return-based normalization (RBS) ---
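+ # Both the critic targets and the advantages are divided by the batch return std,
+ # so value regression and the policy gradient share a common scale.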
234
+ sigma_t = returns.std(unbiased=False) + 1e-8
235
+ returns = returns / sigma_t
236
+ self.sigma_history.append(sigma_t.item())
237
+ adv = adv / sigma_t
238
+ # Advantage normalization
239
+ adv = (adv - adv.mean()) / (adv.std(unbiased=False) + 1e-8)
240
+
241
+ # --- PPO Multiple Epochs + Minibatch ---
242
+ total_loss_epoch = 0.0
243
+ num_samples = len(states)
244
+ batch_size = min(64, num_samples)
245
+ ppo_epochs = 4
246
+
247
+ for _ in range(ppo_epochs):
248
+ # Shuffle indices
249
+ idxs = T.randperm(num_samples)
250
+ for start in range(0, num_samples, batch_size):
251
+ batch_idx = idxs[start:start + batch_size]
252
+
253
+ b_states = states[batch_idx]
254
+ b_actions = actions[batch_idx]
255
+ b_old_logp = old_logp[batch_idx]
256
+ b_returns = returns[batch_idx]
257
+ b_adv = adv[batch_idx]
258
+
259
+ dist = self.policy.next_action(b_states)
260
+ new_logp = dist.log_prob(b_actions)
261
+ entropy = dist.entropy().mean()
262
+ ratio = (new_logp - b_old_logp).exp()
263
+
264
+ # --- Clipped surrogate objective ---
265
+ surr1 = ratio * b_adv
266
+ surr2 = T.clamp(ratio, 1 - self.clip, 1 + self.clip) * b_adv
267
+ policy_loss = -T.min(surr1, surr2).mean()
268
+
269
+ # --- Critic loss ---
270
+ value_pred = self.critic.evaluated_state(b_states)
271
+ value_loss = 0.5 * (b_returns - value_pred).pow(2).mean()
272
+
273
+ # --- Total loss ---
274
+ total_loss = (
275
+ policy_loss +
276
+ self.value_coef * value_loss -
277
+ self.entropy_coef * entropy
278
+ )
279
+
280
+ # Debug: track individual loss components
281
+ self.policy_loss_history.append(policy_loss.item())
282
+ self.value_loss_history.append(value_loss.item())
283
+
284
+ self.opt.zero_grad(set_to_none=True)
285
+ total_loss.backward()
286
+ self.opt.step()
287
+ total_loss_epoch += total_loss.item()
288
+
289
+ # Clear memory after full PPO update
290
+ self.memory.clear()
291
+
292
+ return total_loss_epoch / (ppo_epochs * (num_samples / batch_size))
293
+
294
+
295
+
296
+
297
+
298
+
299
+ def update_observation_norm(self):
300
+ if len(self.memory.states) == 0:
301
+ return 0.0
302
+
303
+ # Convert memory to tensors
304
+ states = T.as_tensor(np.array(self.memory.states), dtype=T.float32, device=self.device)
305
+ actions = T.as_tensor(self.memory.actions, dtype=T.long, device=self.device)
306
+ rewards = T.as_tensor(self.memory.rewards, dtype=T.float32, device=self.device)
307
+ dones = T.as_tensor(self.memory.dones, dtype=T.float32, device=self.device)
308
+ old_logp = T.as_tensor(self.memory.log_probs, dtype=T.float32, device=self.device)
309
+ values = T.as_tensor(self.memory.values, dtype=T.float32, device=self.device)
310
+
311
+ with T.no_grad():
312
+ # Compute next values (bootstrap for final step)
313
+ next_values = T.cat([values[1:], values[-1:].clone()])
314
+ deltas = rewards + self.gamma * next_values * (1 - dones) - values
315
+
316
+ # --- GAE-Lambda ---
317
+ adv = T.zeros_like(rewards)
318
+ gae = 0.0
319
+ for t in reversed(range(len(rewards))):
320
+ gae = deltas[t] + self.gamma * self.lam * (1 - dones[t]) * gae
321
+ adv[t] = gae
322
+
323
+ returns = adv + values
324
+
325
+ # --- observation normalization ---
326
+ self.observeNorm.update(states)
327
+ states = self.observeNorm.normalize(states)
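+ # Note: the running stats are refreshed with this rollout before normalizing; choose_action
+ # still sees raw observations, so acting and training inputs differ slightly here.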
328
+ # Advantage normalization
329
+ adv = (adv - adv.mean()) / (adv.std(unbiased=False) + 1e-8)
330
+
331
+ # --- PPO Multiple Epochs + Minibatch ---
332
+ total_loss_epoch = 0.0
333
+ num_samples = len(states)
334
+ batch_size = min(64, num_samples)
335
+ ppo_epochs = 4
336
+
337
+ for _ in range(ppo_epochs):
338
+ # Shuffle indices
339
+ idxs = T.randperm(num_samples)
340
+ for start in range(0, num_samples, batch_size):
341
+ batch_idx = idxs[start:start + batch_size]
342
+
343
+ b_states = states[batch_idx]
344
+ b_actions = actions[batch_idx]
345
+ b_old_logp = old_logp[batch_idx]
346
+ b_returns = returns[batch_idx]
347
+ b_adv = adv[batch_idx]
348
+
349
+ dist = self.policy.next_action(b_states)
350
+ new_logp = dist.log_prob(b_actions)
351
+ entropy = dist.entropy().mean()
352
+ ratio = (new_logp - b_old_logp).exp()
353
+
354
+ # --- Clipped surrogate objective ---
355
+ surr1 = ratio * b_adv
356
+ surr2 = T.clamp(ratio, 1 - self.clip, 1 + self.clip) * b_adv
357
+ policy_loss = -T.min(surr1, surr2).mean()
358
+
359
+ # --- Critic loss ---
360
+ value_pred = self.critic.evaluated_state(b_states)
361
+ value_loss = 0.5 * (b_returns - value_pred).pow(2).mean()
362
+
363
+ # --- Total loss ---
364
+ total_loss = (
365
+ policy_loss +
366
+ self.value_coef * value_loss -
367
+ self.entropy_coef * entropy
368
+ )
369
+
370
+ # Debug: track individual loss components
371
+ self.policy_loss_history.append(policy_loss.item())
372
+ self.value_loss_history.append(value_loss.item())
373
+
374
+ self.opt.zero_grad(set_to_none=True)
375
+ total_loss.backward()
376
+ self.opt.step()
377
+ total_loss_epoch += total_loss.item()
378
+
379
+ # Clear memory after full PPO update
380
+ self.memory.clear()
381
+
382
+ return total_loss_epoch / (ppo_epochs * (num_samples / batch_size))
383
+
384
+
385
+
386
+
387
+ def update_advantage_norm(self):
388
+ if len(self.memory.states) == 0:
389
+ return 0.0
390
+
391
+ # Convert memory to tensors
392
+ states = T.as_tensor(np.array(self.memory.states), dtype=T.float32, device=self.device)
393
+ actions = T.as_tensor(self.memory.actions, dtype=T.long, device=self.device)
394
+ rewards = T.as_tensor(self.memory.rewards, dtype=T.float32, device=self.device)
395
+ dones = T.as_tensor(self.memory.dones, dtype=T.float32, device=self.device)
396
+ old_logp = T.as_tensor(self.memory.log_probs, dtype=T.float32, device=self.device)
397
+ values = T.as_tensor(self.memory.values, dtype=T.float32, device=self.device)
398
+
399
+ with T.no_grad():
400
+ # Compute next values (bootstrap for final step)
401
+ next_values = T.cat([values[1:], values[-1:].clone()])
402
+ deltas = rewards + self.gamma * next_values * (1 - dones) - values
403
+
404
+ # --- GAE-Lambda ---
405
+ adv = T.zeros_like(rewards)
406
+ gae = 0.0
407
+ for t in reversed(range(len(rewards))):
408
+ gae = deltas[t] + self.gamma * self.lam * (1 - dones[t]) * gae
409
+ adv[t] = gae
410
+
411
+ # --- Advantage normalization ---
412
+ self.advantageNorm.update(adv)
413
+ adv = self.advantageNorm.normalize(adv)  # use the advantage statistics here, not the observation stats
414
+
415
+ returns = adv + values
416
+
417
+
418
+
419
+ # --- PPO Multiple Epochs + Minibatch ---
420
+ total_loss_epoch = 0.0
421
+ num_samples = len(states)
422
+ batch_size = min(64, num_samples)
423
+ ppo_epochs = 4
424
+
425
+ for _ in range(ppo_epochs):
426
+ # Shuffle indices
427
+ idxs = T.randperm(num_samples)
428
+ for start in range(0, num_samples, batch_size):
429
+ batch_idx = idxs[start:start + batch_size]
430
+
431
+ b_states = states[batch_idx]
432
+ b_actions = actions[batch_idx]
433
+ b_old_logp = old_logp[batch_idx]
434
+ b_returns = returns[batch_idx]
435
+ b_adv = adv[batch_idx]
436
+
437
+ dist = self.policy.next_action(b_states)
438
+ new_logp = dist.log_prob(b_actions)
439
+ entropy = dist.entropy().mean()
440
+ ratio = (new_logp - b_old_logp).exp()
441
+
442
+ # --- Clipped surrogate objective ---
443
+ surr1 = ratio * b_adv
444
+ surr2 = T.clamp(ratio, 1 - self.clip, 1 + self.clip) * b_adv
445
+ policy_loss = -T.min(surr1, surr2).mean()
446
+
447
+ # --- Critic loss ---
448
+ value_pred = self.critic.evaluated_state(b_states)
449
+ value_loss = 0.5 * (b_returns - value_pred).pow(2).mean()
450
+
451
+ # --- Total loss ---
452
+ total_loss = (
453
+ policy_loss +
454
+ self.value_coef * value_loss -
455
+ self.entropy_coef * entropy
456
+ )
457
+
458
+ # Debug: track individual loss components
459
+ self.policy_loss_history.append(policy_loss.item())
460
+ self.value_loss_history.append(value_loss.item())
461
+
462
+ self.opt.zero_grad(set_to_none=True)
463
+ total_loss.backward()
464
+ self.opt.step()
465
+ total_loss_epoch += total_loss.item()
466
+
467
+ # Clear memory after full PPO update
468
+ self.memory.clear()
469
+
470
+ return total_loss_epoch / (ppo_epochs * (num_samples / batch_size))
471
+
472
+ def update_return_norm(self):
473
+ if len(self.memory.states) == 0:
474
+ return 0.0
475
+
476
+ # Convert memory to tensors
477
+ states = T.as_tensor(np.array(self.memory.states), dtype=T.float32, device=self.device)
478
+ actions = T.as_tensor(self.memory.actions, dtype=T.long, device=self.device)
479
+ rewards = T.as_tensor(self.memory.rewards, dtype=T.float32, device=self.device)
480
+ dones = T.as_tensor(self.memory.dones, dtype=T.float32, device=self.device)
481
+ old_logp = T.as_tensor(self.memory.log_probs, dtype=T.float32, device=self.device)
482
+ values = T.as_tensor(self.memory.values, dtype=T.float32, device=self.device)
483
+
484
+ with T.no_grad():
485
+ # Compute next values (bootstrap for final step)
486
+ next_values = T.cat([values[1:], values[-1:].clone()])
487
+ deltas = rewards + self.gamma * next_values * (1 - dones) - values
488
+
489
+ # --- GAE-Lambda ---
490
+ adv = T.zeros_like(rewards)
491
+ gae = 0.0
492
+ for t in reversed(range(len(rewards))):
493
+ gae = deltas[t] + self.gamma * self.lam * (1 - dones[t]) * gae
494
+ adv[t] = gae
495
+
496
+
497
+
498
+ returns = adv + values
499
+
500
+ # --- returns normalization ---
501
+ self.returnNorm.update(returns)
502
+ returns = self.returnNorm.normalize(returns)
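+ # Note: the critic below is regressed onto normalized returns, while the advantages
+ # were computed from the unnormalized value estimates.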
503
+
504
+
505
+ # Advantage normalization
506
+ adv = (adv - adv.mean()) / (adv.std(unbiased=False) + 1e-8)
507
+
508
+ # --- PPO Multiple Epochs + Minibatch ---
509
+ total_loss_epoch = 0.0
510
+ num_samples = len(states)
511
+ batch_size = min(64, num_samples)
512
+ ppo_epochs = 4
513
+
514
+ for _ in range(ppo_epochs):
515
+ # Shuffle indices
516
+ idxs = T.randperm(num_samples)
517
+ for start in range(0, num_samples, batch_size):
518
+ batch_idx = idxs[start:start + batch_size]
519
+
520
+ b_states = states[batch_idx]
521
+ b_actions = actions[batch_idx]
522
+ b_old_logp = old_logp[batch_idx]
523
+ b_returns = returns[batch_idx]
524
+ b_adv = adv[batch_idx]
525
+
526
+ dist = self.policy.next_action(b_states)
527
+ new_logp = dist.log_prob(b_actions)
528
+ entropy = dist.entropy().mean()
529
+ ratio = (new_logp - b_old_logp).exp()
530
+
531
+ # --- Clipped surrogate objective ---
532
+ surr1 = ratio * b_adv
533
+ surr2 = T.clamp(ratio, 1 - self.clip, 1 + self.clip) * b_adv
534
+ policy_loss = -T.min(surr1, surr2).mean()
535
+
536
+ # --- Critic loss ---
537
+ value_pred = self.critic.evaluated_state(b_states)
538
+ value_loss = 0.5 * (b_returns - value_pred).pow(2).mean()
539
+
540
+ # --- Total loss ---
541
+ total_loss = (
542
+ policy_loss +
543
+ self.value_coef * value_loss -
544
+ self.entropy_coef * entropy
545
+ )
546
+
547
+ # Debug: track individual loss components
548
+ self.policy_loss_history.append(policy_loss.item())
549
+ self.value_loss_history.append(value_loss.item())
550
+
551
+ self.opt.zero_grad(set_to_none=True)
552
+ total_loss.backward()
553
+ self.opt.step()
554
+ total_loss_epoch += total_loss.item()
555
+
556
+ # Clear memory after full PPO update
557
+ self.memory.clear()
558
+
559
+ return total_loss_epoch / (ppo_epochs * (num_samples / batch_size))
560
+
561
+ def update_reward_gradient_clipping(self):
562
+ if len(self.memory.states) == 0:
563
+ return 0.0
564
+
565
+ # Convert memory to tensors
566
+ states = T.as_tensor(np.array(self.memory.states), dtype=T.float32, device=self.device)
567
+ actions = T.as_tensor(self.memory.actions, dtype=T.long, device=self.device)
568
+ rewards = T.as_tensor(self.memory.rewards, dtype=T.float32, device=self.device)
569
+ dones = T.as_tensor(self.memory.dones, dtype=T.float32, device=self.device)
570
+ old_logp = T.as_tensor(self.memory.log_probs, dtype=T.float32, device=self.device)
571
+ values = T.as_tensor(self.memory.values, dtype=T.float32, device=self.device)
572
+
573
+ # Reward clipping
574
+ rewards = T.clamp(rewards, -1, 1)
575
+
576
+ with T.no_grad():
577
+ # Compute next values (bootstrap for final step)
578
+ next_values = T.cat([values[1:], values[-1:].clone()])
579
+ deltas = rewards + self.gamma * next_values * (1 - dones) - values
580
+
581
+ # --- GAE-Lambda ---
582
+ adv = T.zeros_like(rewards)
583
+ gae = 0.0
584
+ for t in reversed(range(len(rewards))):
585
+ gae = deltas[t] + self.gamma * self.lam * (1 - dones[t]) * gae
586
+ adv[t] = gae
587
+
588
+ returns = adv + values
589
+ # Advantage normalization
590
+ adv = (adv - adv.mean()) / (adv.std(unbiased=False) + 1e-8)
591
+
592
+ # --- PPO Multiple Epochs + Minibatch ---
593
+ total_loss_epoch = 0.0
594
+ num_samples = len(states)
595
+ batch_size = min(64, num_samples)
596
+ ppo_epochs = 4
597
+
598
+ for _ in range(ppo_epochs):
599
+ # Shuffle indices
600
+ idxs = T.randperm(num_samples)
601
+ for start in range(0, num_samples, batch_size):
602
+ batch_idx = idxs[start:start + batch_size]
603
+
604
+ b_states = states[batch_idx]
605
+ b_actions = actions[batch_idx]
606
+ b_old_logp = old_logp[batch_idx]
607
+ b_returns = returns[batch_idx]
608
+ b_adv = adv[batch_idx]
609
+
610
+ dist = self.policy.next_action(b_states)
611
+ new_logp = dist.log_prob(b_actions)
612
+ entropy = dist.entropy().mean()
613
+ ratio = (new_logp - b_old_logp).exp()
614
+
615
+ # --- Clipped surrogate objective ---
616
+ surr1 = ratio * b_adv
617
+ surr2 = T.clamp(ratio, 1 - self.clip, 1 + self.clip) * b_adv
618
+ policy_loss = -T.min(surr1, surr2).mean()
619
+
620
+ # --- Critic loss ---
621
+ value_pred = self.critic.evaluated_state(b_states)
622
+ value_loss = 0.5 * (b_returns - value_pred).pow(2).mean()
623
+
624
+ # --- Total loss ---
625
+ total_loss = (
626
+ policy_loss +
627
+ self.value_coef * value_loss -
628
+ self.entropy_coef * entropy
629
+ )
630
+
631
+ # Debug: track individual loss components
632
+ self.policy_loss_history.append(policy_loss.item())
633
+ self.value_loss_history.append(value_loss.item())
634
+
635
+ self.opt.zero_grad(set_to_none=True)
636
+ total_loss.backward()
637
+ T.nn.utils.clip_grad_norm_(list(self.policy.parameters()) + list(self.critic.parameters()), 0.5)
638
+ self.opt.step()
639
+
640
+ total_loss_epoch += total_loss.item()
641
+
642
+ # Clear memory after full PPO update
643
+ self.memory.clear()
644
+
645
+ return total_loss_epoch / (ppo_epochs * (num_samples / batch_size))
646
+
647
+ """
648
+ # Policy network (simple MLP, flattened observations)
649
+ class Policy(nn.Module):
650
+ def __init__(self, obs_dim: int, action_dim: int, hidden: int):
651
+ super().__init__()
652
+ self.net = nn.Sequential(
653
+ nn.Linear(obs_dim, hidden),
654
+ nn.ReLU(),
655
+ nn.Linear(hidden, hidden),
656
+ nn.ReLU(),
657
+ nn.Linear(hidden, action_dim)
658
+ )
659
+
660
+ def next_action(self, state: T.Tensor) -> Categorical:
661
+ # Returns the probability distribution over actions
662
+ if state.dim() == 1:
663
+ state = state.unsqueeze(0)
664
+ state = state.view(state.size(0), -1)
665
+ return Categorical(logits=self.net(state))
666
+ """
667
+
668
+ # Policy network (CNN)
669
+ class Policy(nn.Module):
670
+ def __init__(self, obs_shape: tuple, action_dim: int, hidden: int):
671
+ super().__init__()
672
+ c, h, w = obs_shape
673
+ # Suggested architecture for Atari: https://arxiv.org/pdf/1312.5602
674
+ self.cnn = nn.Sequential(
675
+ nn.Conv2d(c, 16, kernel_size=8, stride=4),
676
+ nn.ReLU(),
677
+ nn.Conv2d(16, 32, kernel_size=4, stride=2),
678
+ nn.ReLU(),
679
+ nn.Flatten()
680
+ )
681
+
682
+ with T.no_grad():
683
+ cnn_output_dim = self.cnn(T.zeros(1, c, h, w)).shape[1]
684
+
685
+ self.net = nn.Sequential(
686
+ nn.Linear(cnn_output_dim, hidden),
687
+ nn.ReLU(),
688
+ nn.Linear(hidden, action_dim)
689
+ )
690
+
691
+ def next_action(self, state: T.Tensor) -> Categorical:
692
+ # Returns the probability distribution over actions
693
+ if state.dim() == 3:
694
+ state = state.unsqueeze(0)
695
+ cnn_out = self.cnn(state)
696
+ return Categorical(logits=self.net(cnn_out))
697
+
698
+ """
699
+ # Critic network (simple MLP, flattened observations)
700
+ class Critic(nn.Module):
701
+ def __init__(self, obs_dim: int, hidden: int):
702
+ super().__init__()
703
+ self.net = nn.Sequential(
704
+ nn.Linear(obs_dim, hidden),
705
+ nn.ReLU(),
706
+ nn.Linear(hidden, hidden),
707
+ nn.ReLU(),
708
+ nn.Linear(hidden, 1)
709
+ )
710
+
711
+ def evaluated_state(self, x: T.Tensor) -> T.Tensor:
712
+ if x.dim() == 1:
713
+ x = x.unsqueeze(0)
714
+ x = x.view(x.size(0), -1)
715
+ return self.net(x).squeeze(-1)
716
+ """
717
+
718
+ # Critic network (CNN)
719
+ class Critic(nn.Module):
720
+ def __init__(self, obs_shape: tuple, hidden: int):
721
+ super().__init__()
722
+ c, h, w = obs_shape
723
+ # Suggested architecture for Atari: https://arxiv.org/pdf/1312.5602
724
+ self.cnn = nn.Sequential(
725
+ nn.Conv2d(c, 16, kernel_size=8, stride=4),
726
+ nn.ReLU(),
727
+ nn.Conv2d(16, 32, kernel_size=4, stride=2),
728
+ nn.ReLU(),
729
+ nn.Flatten()
730
+ )
731
+
732
+ with T.no_grad():
733
+ cnn_output_dim = self.cnn(T.zeros(1, c, h, w)).shape[1]
734
+
735
+ self.net = nn.Sequential(
736
+ nn.Linear(cnn_output_dim, hidden),
737
+ nn.ReLU(),
738
+ nn.Linear(hidden, 1)
739
+ )
740
+
741
+ def evaluated_state(self, x: T.Tensor) -> T.Tensor:
742
+ if x.dim() == 3:
743
+ x = x.unsqueeze(0)
744
+ cnn_out = self.cnn(x)
745
+ return self.net(cnn_out).squeeze(-1)
746
+
747
+ class Memory():
748
+ def __init__(self):
749
+ self.states = []
750
+ self.actions = []
751
+ self.rewards = []
752
+ self.dones = []
753
+ self.log_probs = []
754
+ self.values = []
755
+ self.next_values = []
756
+
757
+ def store(self, state, action, reward, done, log_prob, value, next_value):
758
+ self.states.append(np.asarray(state, dtype=np.float32))
759
+ self.actions.append(int(action))
760
+ self.rewards.append(float(reward))
761
+ self.dones.append(float(done))
762
+ self.log_probs.append(float(log_prob))
763
+ self.values.append(float(value))
764
+ self.next_values.append(float(next_value))
765
+
766
+ """
767
+ # For mini-batch updates? To be implemented
768
+ def start_batch(self, batch_size: int):
769
+ n_states = len(self.states)
770
+ starts = np.arange(0, n_states, batch_size)
771
+ index = np.arange(n_states, dtype=np.int64)
772
+ np.random.shuffle(index)
773
+ return [index[s:s + batch_size] for s in starts]
774
+ """
775
+
776
+ def clear(self):
777
+ self.states = []
778
+ self.actions = []
779
+ self.rewards = []
780
+ self.dones = []
781
+ self.log_probs = []
782
+ self.values = []
783
+ self.next_values = []
784
+
785
+
786
+
787
+ class ObservationNorm:
788
+ def __init__(self):
789
+ self.main_mean = 0
790
+ self.main_var = 0
791
+ self.count = 1e-4
792
+
793
+ def update(self, x: T.Tensor):
794
+ batch_mean = T.mean(x, dim=0)
795
+ batch_var = T.var(x, dim=0)
796
+ batch_count = x.shape[0]
797
+ self._update_from_moments(batch_mean, batch_var, batch_count)
798
+
799
+ def _update_from_moments(self, batch_mean, batch_var, batch_count):
800
+ delta = batch_mean - self.main_mean
801
+ tot_count = self.count + batch_count
802
+ new_mean = self.main_mean + delta * batch_count / tot_count #Update the running mean
803
+ m_a = self.main_var * self.count
804
+ m_b = batch_var * batch_count
805
+ M2 = m_a + m_b + delta.pow(2) * self.count * batch_count / tot_count
806
+ new_var = M2 / tot_count # update the running variance
807
+
808
+ self.main_mean, self.main_var, self.count = new_mean, new_var, tot_count
809
+
810
+ def normalize(self, x):
811
+
812
+ return (x - self.main_mean) / (T.sqrt(self.main_var) + 1e-8) # We add epsilon to make sure that we don't
812
+ # divide by zero.
814
+
815
+
816
+
817
+
818
+
819
+ class AdvantageNorm:
820
+ '''
821
+ This class implements the Advantage Normalization. The purpose is to normalize either across batches or
822
+ only within the same batch.
823
+
824
+ '''
825
+ def __init__(self):
826
+ self.main_mean = 0
827
+ self.main_var = 0
828
+ self.count = 1e-4
829
+
830
+ def update(self, x: T.Tensor):
831
+ batch_mean = T.mean(x, dim=0)
832
+ batch_var = T.var(x, dim=0)
833
+ batch_count = x.shape[0]
834
+ self._update_from_moments(batch_mean, batch_var, batch_count)
835
+
836
+ def _update_from_moments(self, batch_mean, batch_var, batch_count):
837
+ delta = batch_mean - self.main_mean
838
+ tot_count = self.count + batch_count
839
+ new_mean = self.main_mean + delta * batch_count / tot_count #Update the running mean
840
+ m_a = self.main_var * self.count
841
+ m_b = batch_var * batch_count
842
+ M2 = m_a + m_b + delta.pow(2) * self.count * batch_count / tot_count
843
+ new_var = M2 / tot_count # update the running variance
844
+
845
+ self.main_mean, self.main_var, self.count = new_mean, new_var, tot_count
846
+
847
+ def normalize(self, x):
848
+
849
+ return (x - self.main_mean) / (T.sqrt(self.main_var) + 1e-8) # We add epsilon to make sure that we don't
849
+ # divide by zero.
851
+
852
+
853
+
854
+
855
+ class ReturnNorm:
856
+ '''
857
+ This class implements Return Normalization. The purpose is to normalize either across batches or
858
+ only within the same batch.
859
+
860
+ '''
861
+ def __init__(self):
862
+ self.main_mean = 0
863
+ self.main_var = 0
864
+ self.count = 1e-4
865
+
866
+ def update(self, x: T.Tensor):
867
+ batch_mean = T.mean(x, dim=0)
868
+ batch_var = T.var(x, dim=0)
869
+ batch_count = x.shape[0]
870
+ self._update_from_moments(batch_mean, batch_var, batch_count)
871
+
872
+ def _update_from_moments(self, batch_mean, batch_var, batch_count):
873
+ delta = batch_mean - self.main_mean
874
+ tot_count = self.count + batch_count
875
+ new_mean = self.main_mean + delta * batch_count / tot_count #Update the running mean
876
+ m_a = self.main_var * self.count
877
+ m_b = batch_var * batch_count
878
+ M2 = m_a + m_b + delta.pow(2) * self.count * batch_count / tot_count
879
+ new_var = M2 / tot_count # update the running variance
880
+
881
+ self.main_mean, self.main_var, self.count = new_mean, new_var, tot_count
882
+
883
+ def normalize(self, x):
884
+
885
+ return (x - self.main_mean) / (T.sqrt(self.main_var) + 1e-8) # We add epsilon to make sure that we don't
885
+ # divide by zero.
887
+
888
+
889
+
890
+
891
+
Observation_Advantage_Norm_diff_env/ppo_rew_norm_obs_env_diff_env.py ADDED
@@ -0,0 +1,191 @@
1
+
2
+ import gymnasium as gym
3
+ import sys
4
+ import matplotlib.pyplot as plt
5
+ import ale_py
6
+ from ppo__rew_norm_obs_diff_env import *
7
+ from gymnasium.spaces import Box
8
+ import cv2
9
+
10
+ def preprocess(obs):
11
+ # Convert to grayscale
12
+ obs = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY)
13
+ # Resize
14
+ obs = cv2.resize(obs, (84, 84), interpolation=cv2.INTER_AREA)
15
+ # Add channel dimension and normalize
16
+ return np.expand_dims(obs, axis=0).astype(np.float32) / 255.0
17
+
18
+ class PlotMultiple:
19
+ def __init__(self):
20
+ self.fig = plt.figure(figsize=(12, 8))
21
+
22
+ """
23
+ # Plot for Return-Based Scaling only
24
+ ax1 = plt.subplot(220)
25
+ ax1.plot(agent.sigma_history, label="Return σ")
26
+ ax1.set_xlabel("PPO Update")
27
+ ax1.set_ylabel("σ (Return Std)")
28
+ """
29
+
30
+ self.ax2 = plt.subplot(221)
31
+ self.ax2.set_ylabel("Average PPO Loss")
32
+ self.ax2.set_xlabel("PPO Update")
33
+
34
+ self.ax3 = plt.subplot(222)
35
+ self.ax3.set_ylabel("Reward")
36
+ self.ax3.set_xlabel("PPO Update")
37
+
38
+ # Details about value loss and policy loss
39
+ self.ax4 = plt.subplot(223)
40
+ self.ax4.set_ylabel("Policy Loss")
41
+ self.ax4.set_xlabel("Training Step")
42
+ self.ax4.legend()
43
+
44
+
45
+ self.ax5 = plt.subplot(224)
46
+ self.ax5.set_ylabel("Value Loss")
47
+ self.ax5.set_xlabel("Training Step")
48
+ self.ax5.legend()
49
+
50
+
51
+
52
+
53
+ def setPlot(self, loss_history, reward_history, policy_loss_history
54
+ , value_loss_history, env ):
55
+ self.ax2.plot(loss_history, label=env)  # "title" is not a plot() kwarg; set it on the axes
+ self.ax2.set_title("Loss")
56
+
57
+ self.ax3.plot(reward_history, label=env)
+ self.ax3.set_title("Reward")
58
+
59
+
60
+ self.ax4.plot(policy_loss_history, label=env, alpha=0.7)
+ self.ax4.set_title("policy_loss")
61
+
62
+
63
+ self.ax5.plot(value_loss_history, label=env, alpha=0.7)
+ self.ax5.set_title("value_loss")
64
+
65
+
66
+
67
+
68
+
69
+
70
+
71
+ def store(self, environ):
72
+ self.fig.suptitle("Performance with different Environments")
73
+ for ax in (self.ax2, self.ax3, self.ax4, self.ax5):
+ ax.legend()
+ self.fig.tight_layout()
74
+ # "/" in the ALE env id would be treated as a directory in the filename, so strip it out
+ self.fig.savefig("Performance of " + environ.replace("/", "_") + " with different_environment_.png")
75
+
76
+
77
+ def rl_model(type, plot, environ):
78
+ # env = gym.make("ALE/SpaceInvaders-v5", render_mode='human')
79
+ # env = gym.make("ALE/Pacman-v5", render_mode="human")
80
+ env = gym.make(environ)
81
+
82
+ episode = 0
83
+ total_return = 0
84
+ ep_return = 0
85
+ steps = 1000
86
+ batches = 100
87
+
88
+ print("Observation space:", env.observation_space)
89
+ print("Action space:", env.action_space)
90
+ """
91
+ agent = Agent(obs_space=env.observation_space, action_space=env.action_space,
92
+ hidden=64, lr=3e-4, gamma=0.99, clip_coef=0.2,
93
+ entropy_coef=0.01, value_coef=0.5, seed=70,
94
+ batch_size = 64, ppo_epochs = 4, lam = 0.95)
95
+
96
+ """
97
+ # Initialize CNN with a dummy observation (to get correct input shape)
98
+ obs, _ = env.reset()
99
+ dummy_obs_space = Box(low=0.0, high=1.0, shape=preprocess(obs).shape)
100
+ update_type = type
101
+ agent = Agent(obs_space=dummy_obs_space, action_space=env.action_space,
102
+ hidden=64, lr=3e-4, gamma=0.99, clip_coef=0.2,
103
+ entropy_coef=0.01, value_coef=0.5, seed=70,
104
+ batch_size=64, ppo_epochs=4, lam=0.95, update_type=update_type)
105
+ """
106
+ # Stats for Return-Based Scaling only
107
+ # === Return-Based Scaling stats ===
108
+ r_mean, r_var = 0.0, 1e-8
109
+ g2_mean = 1.0
110
+
111
+ agent.r_var = r_var
112
+ agent.g2_mean = g2_mean
113
+ """
114
+
115
+ try:
116
+ obs, info = env.reset(seed=42)
117
+ state = preprocess(obs)
118
+
119
+ loss_history = []
120
+ reward_history = []
121
+
122
+ for update in range(1, batches + 1):
123
+ for t in range(steps):
124
+ action, logp, value = agent.choose_action(state)
125
+ next_obs, reward, terminated, truncated, info = env.step(action)
126
+ done = terminated or truncated
127
+ next_state = preprocess(next_obs)
128
+
129
+ agent.remember(state, action, reward, done, logp, value, next_state)
130
+
131
+ ep_return += reward
132
+ state = next_state
133
+
134
+ if done:
135
+ episode += 1
136
+ total_return += ep_return
137
+ print(f"Episode {episode} return: {ep_return:.2f}")
138
+ ep_return = 0
139
+ obs, info = env.reset()
140
+ state = preprocess(obs)
141
+
142
+ # Using reward gradient clipping
143
+ avg_loss = agent._update()
144
+
145
+ # Vanilla PPO (no normalization)
146
+ # avg_loss = agent.vanilla_ppo_update()
147
+ loss_history.append(avg_loss)
148
+
149
+ avg_ret = (total_return / episode) if episode else 0
150
+ reward_history.append(avg_ret)
151
+ print(f"Update {update}: episodes={episode}, avg_return={avg_ret:.2f}, avg_loss={avg_loss:.4f}")
152
+
153
+
154
+ plot.setPlot(loss_history, reward_history, agent.policy_loss_history, agent.value_loss_history, environ)
155
+
156
+
157
+
158
+ except Exception as e:
159
+ print(f"Error: {e}", file=sys.stderr)
160
+ return 1
161
+ finally:
162
+ avg = total_return / episode if episode else 0
163
+ print(f"\nEpisodes: {episode}, Avg return: {avg:.3f}")
164
+ env.close()
165
+
166
+ return 0
167
+
168
+
169
+
170
+
171
+
172
+
173
+
174
+
175
+ def main() -> int:
176
+
177
+ list_env = ["ALE/Pacman-v5", "ALE/Gravitar-v5", "ALE/Boxing-v5"]
178
+ type_list = ["update_observation_norm", "update_advantage_norm",
179
+ "update_return_norm", "vanilla_ppo_update"]
180
+ for env in list_env:
181
+ plot = PlotMultiple()
182
+ for type in type_list:
183
+ rl_model(type, plot, env)
184
+
185
+ plot.store(env)
186
+
187
+ return 0
188
+
189
+
190
+ if __name__ == "__main__":
191
+ raise SystemExit(main())
Observation_Advantage_Norm_diff_hypo/Performance config for Learning Rate of update_advantage_norm.png ADDED
Observation_Advantage_Norm_diff_hypo/Performance config for Learning Rate of update_observation_norm.png ADDED
Observation_Advantage_Norm_diff_hypo/Performance config for Learning Rate of update_return_norm.png ADDED
Observation_Advantage_Norm_diff_hypo/Performance config for Learning Rate of vanilla_ppo_update.png ADDED
Observation_Advantage_Norm_diff_hypo/Performance config for entropy coefficient of update_advantage_norm.png ADDED
Observation_Advantage_Norm_diff_hypo/Performance config for entropy coefficient of update_observation_norm.png ADDED
Observation_Advantage_Norm_diff_hypo/Performance config for entropy coefficient of update_return_norm.png ADDED
Observation_Advantage_Norm_diff_hypo/Performance config for entropy coefficient of vanilla_ppo_update.png ADDED
Observation_Advantage_Norm_diff_hypo/Performance config for gamma value of update_advantage_norm.png ADDED
Observation_Advantage_Norm_diff_hypo/Performance config for gamma value of update_observation_norm.png ADDED
Observation_Advantage_Norm_diff_hypo/Performance config for gamma value of update_return_norm.png ADDED
Observation_Advantage_Norm_diff_hypo/Performance config for gamma value of vanilla_ppo_update.png ADDED
Observation_Advantage_Norm_diff_hypo/ppo__rew_norm_obs_diff_hyp.py ADDED
@@ -0,0 +1,890 @@
1
+ import numpy as np
2
+ import torch as T
3
+ import torch.nn as nn
4
+ import torch.optim as optim
5
+ from torch.distributions import Categorical
6
+
7
+
8
+ class Agent:
9
+ def __init__(
10
+ self,
11
+ obs_space,
12
+ action_space,
13
+ hidden,
14
+ gamma,
15
+ clip_coef,
16
+ lr,
17
+ value_coef,
18
+ entropy_coef,
19
+ seed,
20
+ batch_size,
21
+ ppo_epochs,
22
+ lam,
23
+ update_type
24
+
25
+ ):
26
+ # Initialize seed for reproducibility
27
+ if seed is not None:
28
+ np.random.seed(seed)
29
+ T.manual_seed(seed)
30
+ """
31
+ # For flat observations (MLP model)
32
+ # Use GPU if available
33
+ self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
34
+ self.obs_dim = int(np.prod(getattr(obs_space, "shape", (obs_space,))))
35
+ self.action_dim = int(getattr(action_space, "n", action_space))
36
+
37
+ # Initialize the policy and the critic networks
38
+ self.policy = Policy(self.obs_dim, self.action_dim, hidden).to(self.device)
39
+ self.critic = Critic(self.obs_dim, hidden).to(self.device)
40
+ """
41
+ # Use GPU if available
42
+ self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
43
+ self.action_dim = int(getattr(action_space, "n", action_space))
44
+ self.update_type = update_type
45
+
46
+ # Initialize the policy and the critic networks
47
+ # Pass the shape tuple directly, not the flattened dimension.
48
+ self.policy = Policy(obs_space.shape, self.action_dim, hidden).to(self.device)
49
+ self.critic = Critic(obs_space.shape, hidden).to(self.device)
50
+ self.observeNorm = ObservationNorm()
51
+ self.advantageNorm = AdvantageNorm()
52
+ self.returnNorm = ReturnNorm()
53
+
54
+ # Set optimizer for policy and critic networks
55
+ self.opt = optim.Adam(
56
+ list(self.policy.parameters()) + list(self.critic.parameters()),
57
+ lr=lr
58
+ )
59
+
60
+ self.gamma = gamma
61
+ self.clip = clip_coef
62
+ self.value_coef = value_coef
63
+ self.entropy_coef = entropy_coef
64
+ self.sigma_history = []
65
+ self.loss_history = []
66
+ self.policy_loss_history = []
67
+ self.value_loss_history = []
68
+ self.entropy_history = []
69
+ self.lam = lam
70
+ self.ppo_epochs = ppo_epochs
71
+ self.batch_size = batch_size
72
+
73
+ self.memory = Memory()
74
+ """
75
+ # Choose action and remember for flat observations (MLP model)
76
+ def choose_action(self, observation):
77
+ # Returns: action, log probability, value of the state
78
+ state = T.as_tensor(observation, dtype=T.float32, device=self.device).view(-1)
79
+ with T.no_grad():
80
+ # Forward function (defined in Policy class)
81
+ dist = self.policy.next_action(state)
82
+ action = dist.sample()
83
+ logp = dist.log_prob(action)
84
+ value = self.critic.evaluated_state(state)
85
+ return int(action.item()), float(logp.item()), float(value.item())
86
+
87
+ def remember(self, state, action, reward, done, log_prob, value, next_state):
88
+ with T.no_grad():
89
+ # Pass on next state and have it evaluated by the critic network
90
+ ns = T.as_tensor(next_state, dtype=T.float32, device=self.device).view(-1)
91
+ next_value = self.critic.evaluated_state(ns).item()
92
+ self.memory.store(state, action, reward, done, log_prob, value, next_value)
93
+ """
94
+ # For CNN model
95
+ def choose_action(self, observation):
96
+ # Returns: action, log probability, value of the state
97
+ state = T.as_tensor(observation, dtype=T.float32, device=self.device) # Remove .view(-1)
98
+ with T.no_grad():
99
+ # Forward function (defined in Policy class)
100
+ dist = self.policy.next_action(state)
101
+ action = dist.sample()
102
+ logp = dist.log_prob(action)
103
+ value = self.critic.evaluated_state(state)
104
+ return int(action.item()), float(logp.item()), float(value.item())
105
+
106
+ def remember(self, state, action, reward, done, log_prob, value, next_state):
107
+ with T.no_grad():
108
+ # Pass on next state and have it evaluated by the critic network
109
+ ns = T.as_tensor(next_state, dtype=T.float32, device=self.device) # Remove .view(-1)
110
+ next_value = self.critic.evaluated_state(ns).item()
111
+ self.memory.store(state, action, reward, done, log_prob, value, next_value)
112
+
113
+
114
+ def _update(self):
115
+ if self.update_type == "update_observation_norm":
116
+ return self.update_observation_norm()
117
+ elif self.update_type == "update_advantage_norm":
118
+ return self.update_advantage_norm()
119
+ elif self.update_type == "update_return_norm":
120
+ return self.update_return_norm()
121
+ else:
122
+ return self.vanilla_ppo_update()
123
+
124
+ def vanilla_ppo_update(self):
125
+ if len(self.memory.states) == 0:
126
+ return 0.0
127
+
128
+ # Convert memory to tensors
129
+ states = T.as_tensor(np.array(self.memory.states), dtype=T.float32, device=self.device)
130
+ actions = T.as_tensor(self.memory.actions, dtype=T.long, device=self.device)
131
+ rewards = T.as_tensor(self.memory.rewards, dtype=T.float32, device=self.device)
132
+ dones = T.as_tensor(self.memory.dones, dtype=T.float32, device=self.device)
133
+ old_logp = T.as_tensor(self.memory.log_probs, dtype=T.float32, device=self.device)
134
+ values = T.as_tensor(self.memory.values, dtype=T.float32, device=self.device)
135
+
136
+ with T.no_grad():
137
+ # Compute next values (bootstrap for final step)
138
+ next_values = T.cat([values[1:], values[-1:].clone()])
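+ # Note: this shifts the stored values one step and reuses the final value as its
+ # own bootstrap; the per-step next_values saved in Memory are not used here.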
139
+ deltas = rewards + self.gamma * next_values * (1 - dones) - values
140
+
141
+ # --- GAE-Lambda ---
142
+ adv = T.zeros_like(rewards)
143
+ gae = 0.0
144
+ for t in reversed(range(len(rewards))):
145
+ gae = deltas[t] + self.gamma * self.lam * (1 - dones[t]) * gae
146
+ adv[t] = gae
147
+
148
+ returns = adv + values
149
+ # Advantage normalization
150
+ adv = (adv - adv.mean()) / (adv.std(unbiased=False) + 1e-8)
151
+
152
+ # --- PPO Multiple Epochs + Minibatch ---
153
+ total_loss_epoch = 0.0
154
+ num_samples = len(states)
155
+ batch_size = min(64, num_samples)
156
+ ppo_epochs = 4
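+ # These local values take precedence over the batch_size / ppo_epochs passed to __init__.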
157
+
158
+ for _ in range(ppo_epochs):
159
+ # Shuffle indices
160
+ idxs = T.randperm(num_samples)
161
+ for start in range(0, num_samples, batch_size):
162
+ batch_idx = idxs[start:start + batch_size]
163
+
164
+ b_states = states[batch_idx]
165
+ b_actions = actions[batch_idx]
166
+ b_old_logp = old_logp[batch_idx]
167
+ b_returns = returns[batch_idx]
168
+ b_adv = adv[batch_idx]
169
+
170
+ dist = self.policy.next_action(b_states)
171
+ new_logp = dist.log_prob(b_actions)
172
+ entropy = dist.entropy().mean()
173
+ ratio = (new_logp - b_old_logp).exp()
174
+
175
+ # --- Clipped surrogate objective ---
176
+ surr1 = ratio * b_adv
177
+ surr2 = T.clamp(ratio, 1 - self.clip, 1 + self.clip) * b_adv
178
+ policy_loss = -T.min(surr1, surr2).mean()
179
+
180
+ # --- Critic loss ---
181
+ value_pred = self.critic.evaluated_state(b_states)
182
+ value_loss = 0.5 * (b_returns - value_pred).pow(2).mean()
183
+
184
+ # --- Total loss ---
185
+ total_loss = (
186
+ policy_loss +
187
+ self.value_coef * value_loss -
188
+ self.entropy_coef * entropy
189
+ )
190
+
191
+ # Debug: track individual loss components
192
+ self.policy_loss_history.append(policy_loss.item())
193
+ self.value_loss_history.append(value_loss.item())
194
+
195
+ self.opt.zero_grad(set_to_none=True)
196
+ total_loss.backward()
197
+ self.opt.step()
198
+
199
+ total_loss_epoch += total_loss.item()
200
+
201
+ # Clear memory after full PPO update
202
+ self.memory.clear()
203
+
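+ # num_samples / batch_size below is fractional when the rollout size is not a
+ # multiple of the batch size, so this is only an approximate per-minibatch average.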
204
+ return total_loss_epoch / (ppo_epochs * (num_samples / batch_size))
205
+
206
+
207
+ def update_rbs(self):
208
+ if len(self.memory.states) == 0:
209
+ return 0.0
210
+
211
+ # Convert memory to tensors
212
+ states = T.as_tensor(np.array(self.memory.states), dtype=T.float32, device=self.device)
213
+ actions = T.as_tensor(self.memory.actions, dtype=T.long, device=self.device)
214
+ rewards = T.as_tensor(self.memory.rewards, dtype=T.float32, device=self.device)
215
+ dones = T.as_tensor(self.memory.dones, dtype=T.float32, device=self.device)
216
+ old_logp = T.as_tensor(self.memory.log_probs, dtype=T.float32, device=self.device)
217
+ values = T.as_tensor(self.memory.values, dtype=T.float32, device=self.device)
218
+
219
+ with T.no_grad():
220
+ # Compute next values (bootstrap for final step)
221
+ next_values = T.cat([values[1:], values[-1:].clone()])
222
+ deltas = rewards + self.gamma * next_values * (1 - dones) - values
223
+
224
+ # --- GAE-Lambda ---
225
+ adv = T.zeros_like(rewards)
226
+ gae = 0.0
227
+ for t in reversed(range(len(rewards))):
228
+ gae = deltas[t] + self.gamma * self.lam * (1 - dones[t]) * gae
229
+ adv[t] = gae
230
+
231
+ returns = adv + values
232
+
233
+ # --- Return-based normalization (RBS) ---
234
+ sigma_t = returns.std(unbiased=False) + 1e-8
235
+ returns = returns / sigma_t
236
+ self.sigma_history.append(sigma_t.item())
237
+ adv = adv / sigma_t
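+ # Scaling returns and advantages by the same sigma keeps the critic targets and
+ # the policy-gradient signal on a consistent scale across updates.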
238
+ # Advantage normalization
239
+ adv = (adv - adv.mean()) / (adv.std(unbiased=False) + 1e-8)
240
+
241
+ # --- PPO Multiple Epochs + Minibatch ---
242
+ total_loss_epoch = 0.0
243
+ num_samples = len(states)
244
+ batch_size = min(64, num_samples)
245
+ ppo_epochs = 4
246
+
247
+ for _ in range(ppo_epochs):
248
+ # Shuffle indices
249
+ idxs = T.randperm(num_samples)
250
+ for start in range(0, num_samples, batch_size):
251
+ batch_idx = idxs[start:start + batch_size]
252
+
253
+ b_states = states[batch_idx]
254
+ b_actions = actions[batch_idx]
255
+ b_old_logp = old_logp[batch_idx]
256
+ b_returns = returns[batch_idx]
257
+ b_adv = adv[batch_idx]
258
+
259
+ dist = self.policy.next_action(b_states)
260
+ new_logp = dist.log_prob(b_actions)
261
+ entropy = dist.entropy().mean()
262
+ ratio = (new_logp - b_old_logp).exp()
263
+
264
+ # --- Clipped surrogate objective ---
265
+ surr1 = ratio * b_adv
266
+ surr2 = T.clamp(ratio, 1 - self.clip, 1 + self.clip) * b_adv
267
+ policy_loss = -T.min(surr1, surr2).mean()
268
+
269
+ # --- Critic loss ---
270
+ value_pred = self.critic.evaluated_state(b_states)
271
+ value_loss = 0.5 * (b_returns - value_pred).pow(2).mean()
272
+
273
+ # --- Total loss ---
274
+ total_loss = (
275
+ policy_loss +
276
+ self.value_coef * value_loss -
277
+ self.entropy_coef * entropy
278
+ )
279
+
280
+ # Debug: track individual loss components
281
+ self.policy_loss_history.append(policy_loss.item())
282
+ self.value_loss_history.append(value_loss.item())
283
+
284
+ self.opt.zero_grad(set_to_none=True)
285
+ total_loss.backward()
286
+ self.opt.step()
287
+ total_loss_epoch += total_loss.item()
288
+
289
+ # Clear memory after full PPO update
290
+ self.memory.clear()
291
+
292
+ return total_loss_epoch / (ppo_epochs * (num_samples / batch_size))
293
+
294
+
295
+
296
+
297
+
298
+
299
+ def update_observation_norm(self):
300
+ if len(self.memory.states) == 0:
301
+ return 0.0
302
+
303
+ # Convert memory to tensors
304
+ states = T.as_tensor(np.array(self.memory.states), dtype=T.float32, device=self.device)
305
+ actions = T.as_tensor(self.memory.actions, dtype=T.long, device=self.device)
306
+ rewards = T.as_tensor(self.memory.rewards, dtype=T.float32, device=self.device)
307
+ dones = T.as_tensor(self.memory.dones, dtype=T.float32, device=self.device)
308
+ old_logp = T.as_tensor(self.memory.log_probs, dtype=T.float32, device=self.device)
309
+ values = T.as_tensor(self.memory.values, dtype=T.float32, device=self.device)
310
+
311
+ with T.no_grad():
312
+ # Compute next values (bootstrap for final step)
313
+ next_values = T.cat([values[1:], values[-1:].clone()])
314
+ deltas = rewards + self.gamma * next_values * (1 - dones) - values
315
+
316
+ # --- GAE-Lambda ---
317
+ adv = T.zeros_like(rewards)
318
+ gae = 0.0
319
+ for t in reversed(range(len(rewards))):
320
+ gae = deltas[t] + self.gamma * self.lam * (1 - dones[t]) * gae
321
+ adv[t] = gae
322
+
323
+ returns = adv + values
324
+
325
+ # --- observation normalization ---
326
+ self.observeNorm.update(states)
327
+ states = self.observeNorm.normalize(states)
328
+ # Advantage normalization
329
+ adv = (adv - adv.mean()) / (adv.std(unbiased=False) + 1e-8)
330
+
331
+ # --- PPO Multiple Epochs + Minibatch ---
332
+ total_loss_epoch = 0.0
333
+ num_samples = len(states)
334
+ batch_size = min(64, num_samples)
335
+ ppo_epochs = 4
336
+
337
+ for _ in range(ppo_epochs):
338
+ # Shuffle indices
339
+ idxs = T.randperm(num_samples)
340
+ for start in range(0, num_samples, batch_size):
341
+ batch_idx = idxs[start:start + batch_size]
342
+
343
+ b_states = states[batch_idx]
344
+ b_actions = actions[batch_idx]
345
+ b_old_logp = old_logp[batch_idx]
346
+ b_returns = returns[batch_idx]
347
+ b_adv = adv[batch_idx]
348
+
349
+ dist = self.policy.next_action(b_states)
350
+ new_logp = dist.log_prob(b_actions)
351
+ entropy = dist.entropy().mean()
352
+ ratio = (new_logp - b_old_logp).exp()
353
+
354
+ # --- Clipped surrogate objective ---
355
+ surr1 = ratio * b_adv
356
+ surr2 = T.clamp(ratio, 1 - self.clip, 1 + self.clip) * b_adv
357
+ policy_loss = -T.min(surr1, surr2).mean()
358
+
359
+ # --- Critic loss ---
360
+ value_pred = self.critic.evaluated_state(b_states)
361
+ value_loss = 0.5 * (b_returns - value_pred).pow(2).mean()
362
+
363
+ # --- Total loss ---
364
+ total_loss = (
365
+ policy_loss +
366
+ self.value_coef * value_loss -
367
+ self.entropy_coef * entropy
368
+ )
369
+
370
+ # Debug: track individual loss components
371
+ self.policy_loss_history.append(policy_loss.item())
372
+ self.value_loss_history.append(value_loss.item())
373
+
374
+ self.opt.zero_grad(set_to_none=True)
375
+ total_loss.backward()
376
+ self.opt.step()
377
+ total_loss_epoch += total_loss.item()
378
+
379
+ # Clear memory after full PPO update
380
+ self.memory.clear()
381
+
382
+ return total_loss_epoch / (ppo_epochs * (num_samples / batch_size))
383
+
384
+
385
+
386
+
387
+ def update_advantage_norm(self):
388
+ if len(self.memory.states) == 0:
389
+ return 0.0
390
+
391
+ # Convert memory to tensors
392
+ states = T.as_tensor(np.array(self.memory.states), dtype=T.float32, device=self.device)
393
+ actions = T.as_tensor(self.memory.actions, dtype=T.long, device=self.device)
394
+ rewards = T.as_tensor(self.memory.rewards, dtype=T.float32, device=self.device)
395
+ dones = T.as_tensor(self.memory.dones, dtype=T.float32, device=self.device)
396
+ old_logp = T.as_tensor(self.memory.log_probs, dtype=T.float32, device=self.device)
397
+ values = T.as_tensor(self.memory.values, dtype=T.float32, device=self.device)
398
+
399
+ with T.no_grad():
400
+ # Compute next values (bootstrap for final step)
401
+ next_values = T.cat([values[1:], values[-1:].clone()])
402
+ deltas = rewards + self.gamma * next_values * (1 - dones) - values
403
+
404
+ # --- GAE-Lambda ---
405
+ adv = T.zeros_like(rewards)
406
+ gae = 0.0
407
+ for t in reversed(range(len(rewards))):
408
+ gae = deltas[t] + self.gamma * self.lam * (1 - dones[t]) * gae
409
+ adv[t] = gae
410
+
411
+ # --- Advantage normalization ---
412
+ returns = adv + values
413
+ self.advantageNorm.update(adv)
414
+ adv = self.advantageNorm.normalize(adv)
415
+
416
+
417
+
418
+ # --- PPO Multiple Epochs + Minibatch ---
419
+ total_loss_epoch = 0.0
420
+ num_samples = len(states)
421
+ batch_size = min(64, num_samples)
422
+ ppo_epochs = 4
423
+
424
+ for _ in range(ppo_epochs):
425
+ # Shuffle indices
426
+ idxs = T.randperm(num_samples)
427
+ for start in range(0, num_samples, batch_size):
428
+ batch_idx = idxs[start:start + batch_size]
429
+
430
+ b_states = states[batch_idx]
431
+ b_actions = actions[batch_idx]
432
+ b_old_logp = old_logp[batch_idx]
433
+ b_returns = returns[batch_idx]
434
+ b_adv = adv[batch_idx]
435
+
436
+ dist = self.policy.next_action(b_states)
437
+ new_logp = dist.log_prob(b_actions)
438
+ entropy = dist.entropy().mean()
439
+ ratio = (new_logp - b_old_logp).exp()
440
+
441
+ # --- Clipped surrogate objective ---
442
+ surr1 = ratio * b_adv
443
+ surr2 = T.clamp(ratio, 1 - self.clip, 1 + self.clip) * b_adv
444
+ policy_loss = -T.min(surr1, surr2).mean()
445
+
446
+ # --- Critic loss ---
447
+ value_pred = self.critic.evaluated_state(b_states)
448
+ value_loss = 0.5 * (b_returns - value_pred).pow(2).mean()
449
+
450
+ # --- Total loss ---
451
+ total_loss = (
452
+ policy_loss +
453
+ self.value_coef * value_loss -
454
+ self.entropy_coef * entropy
455
+ )
456
+
457
+ # Debug: track individual loss components
458
+ self.policy_loss_history.append(policy_loss.item())
459
+ self.value_loss_history.append(value_loss.item())
460
+
461
+ self.opt.zero_grad(set_to_none=True)
462
+ total_loss.backward()
463
+ self.opt.step()
464
+ total_loss_epoch += total_loss.item()
465
+
466
+ # Clear memory after full PPO update
467
+ self.memory.clear()
468
+
469
+ return total_loss_epoch / (ppo_epochs * (num_samples / batch_size))
470
+
471
+ def update_return_norm(self):
472
+ if len(self.memory.states) == 0:
473
+ return 0.0
474
+
475
+ # Convert memory to tensors
476
+ states = T.as_tensor(np.array(self.memory.states), dtype=T.float32, device=self.device)
477
+ actions = T.as_tensor(self.memory.actions, dtype=T.long, device=self.device)
478
+ rewards = T.as_tensor(self.memory.rewards, dtype=T.float32, device=self.device)
479
+ dones = T.as_tensor(self.memory.dones, dtype=T.float32, device=self.device)
480
+ old_logp = T.as_tensor(self.memory.log_probs, dtype=T.float32, device=self.device)
481
+ values = T.as_tensor(self.memory.values, dtype=T.float32, device=self.device)
482
+
483
+ with T.no_grad():
484
+ # Compute next values (bootstrap for final step)
485
+ next_values = T.cat([values[1:], values[-1:].clone()])
486
+ deltas = rewards + self.gamma * next_values * (1 - dones) - values
487
+
488
+ # --- GAE-Lambda ---
489
+ adv = T.zeros_like(rewards)
490
+ gae = 0.0
491
+ for t in reversed(range(len(rewards))):
492
+ gae = deltas[t] + self.gamma * self.lam * (1 - dones[t]) * gae
493
+ adv[t] = gae
494
+
495
+
496
+
497
+ returns = adv + values
498
+
499
+ # --- returns normalization ---
500
+ self.returnNorm.update(returns)
501
+ returns = self.returnNorm.normalize(returns)
502
+
503
+
504
+ # Advantage normalization
505
+ adv = (adv - adv.mean()) / (adv.std(unbiased=False) + 1e-8)
506
+
507
+ # --- PPO Multiple Epochs + Minibatch ---
508
+ total_loss_epoch = 0.0
509
+ num_samples = len(states)
510
+ batch_size = min(64, num_samples)
511
+ ppo_epochs = 4
512
+
513
+ for _ in range(ppo_epochs):
514
+ # Shuffle indices
515
+ idxs = T.randperm(num_samples)
516
+ for start in range(0, num_samples, batch_size):
517
+ batch_idx = idxs[start:start + batch_size]
518
+
519
+ b_states = states[batch_idx]
520
+ b_actions = actions[batch_idx]
521
+ b_old_logp = old_logp[batch_idx]
522
+ b_returns = returns[batch_idx]
523
+ b_adv = adv[batch_idx]
524
+
525
+ dist = self.policy.next_action(b_states)
526
+ new_logp = dist.log_prob(b_actions)
527
+ entropy = dist.entropy().mean()
528
+ ratio = (new_logp - b_old_logp).exp()
529
+
530
+ # --- Clipped surrogate objective ---
531
+ surr1 = ratio * b_adv
532
+ surr2 = T.clamp(ratio, 1 - self.clip, 1 + self.clip) * b_adv
533
+ policy_loss = -T.min(surr1, surr2).mean()
534
+
535
+ # --- Critic loss ---
536
+ value_pred = self.critic.evaluated_state(b_states)
537
+ value_loss = 0.5 * (b_returns - value_pred).pow(2).mean()
538
+
539
+ # --- Total loss ---
540
+ total_loss = (
541
+ policy_loss +
542
+ self.value_coef * value_loss -
543
+ self.entropy_coef * entropy
544
+ )
545
+
546
+ # Debug: track individual loss components
547
+ self.policy_loss_history.append(policy_loss.item())
548
+ self.value_loss_history.append(value_loss.item())
549
+
550
+ self.opt.zero_grad(set_to_none=True)
551
+ total_loss.backward()
552
+ self.opt.step()
553
+ total_loss_epoch += total_loss.item()
554
+
555
+ # Clear memory after full PPO update
556
+ self.memory.clear()
557
+
558
+ return total_loss_epoch / (ppo_epochs * (num_samples / batch_size))
559
+
560
+ def update_reward_gradient_clipping(self):
561
+ if len(self.memory.states) == 0:
562
+ return 0.0
563
+
564
+ # Convert memory to tensors
565
+ states = T.as_tensor(np.array(self.memory.states), dtype=T.float32, device=self.device)
566
+ actions = T.as_tensor(self.memory.actions, dtype=T.long, device=self.device)
567
+ rewards = T.as_tensor(self.memory.rewards, dtype=T.float32, device=self.device)
568
+ dones = T.as_tensor(self.memory.dones, dtype=T.float32, device=self.device)
569
+ old_logp = T.as_tensor(self.memory.log_probs, dtype=T.float32, device=self.device)
570
+ values = T.as_tensor(self.memory.values, dtype=T.float32, device=self.device)
571
+
572
+ # Reward clipping
573
+ rewards = T.clamp(rewards, -1, 1)
574
+
575
+ with T.no_grad():
576
+ # Compute next values (bootstrap for final step)
577
+ next_values = T.cat([values[1:], values[-1:].clone()])
578
+ deltas = rewards + self.gamma * next_values * (1 - dones) - values
579
+
580
+ # --- GAE-Lambda ---
581
+ adv = T.zeros_like(rewards)
582
+ gae = 0.0
583
+ for t in reversed(range(len(rewards))):
584
+ gae = deltas[t] + self.gamma * self.lam * (1 - dones[t]) * gae
585
+ adv[t] = gae
586
+
587
+ returns = adv + values
588
+ # Advantage normalization
589
+ adv = (adv - adv.mean()) / (adv.std(unbiased=False) + 1e-8)
590
+
591
+ # --- PPO Multiple Epochs + Minibatch ---
592
+ total_loss_epoch = 0.0
593
+ num_samples = len(states)
594
+ batch_size = min(64, num_samples)
595
+ ppo_epochs = 4
596
+
597
+ for _ in range(ppo_epochs):
598
+ # Shuffle indices
599
+ idxs = T.randperm(num_samples)
600
+ for start in range(0, num_samples, batch_size):
601
+ batch_idx = idxs[start:start + batch_size]
602
+
603
+ b_states = states[batch_idx]
604
+ b_actions = actions[batch_idx]
605
+ b_old_logp = old_logp[batch_idx]
606
+ b_returns = returns[batch_idx]
607
+ b_adv = adv[batch_idx]
608
+
609
+ dist = self.policy.next_action(b_states)
610
+ new_logp = dist.log_prob(b_actions)
611
+ entropy = dist.entropy().mean()
612
+ ratio = (new_logp - b_old_logp).exp()
613
+
614
+ # --- Clipped surrogate objective ---
615
+ surr1 = ratio * b_adv
616
+ surr2 = T.clamp(ratio, 1 - self.clip, 1 + self.clip) * b_adv
617
+ policy_loss = -T.min(surr1, surr2).mean()
618
+
619
+ # --- Critic loss ---
620
+ value_pred = self.critic.evaluated_state(b_states)
621
+ value_loss = 0.5 * (b_returns - value_pred).pow(2).mean()
622
+
623
+ # --- Total loss ---
624
+ total_loss = (
625
+ policy_loss +
626
+ self.value_coef * value_loss -
627
+ self.entropy_coef * entropy
628
+ )
629
+
630
+ # Debug: track individual loss components
631
+ self.policy_loss_history.append(policy_loss.item())
632
+ self.value_loss_history.append(value_loss.item())
633
+
634
+ self.opt.zero_grad(set_to_none=True)
635
+ total_loss.backward()
636
+ T.nn.utils.clip_grad_norm_(list(self.policy.parameters()) + list(self.critic.parameters()), 0.5)
637
+ self.opt.step()
638
+
639
+ total_loss_epoch += total_loss.item()
640
+
641
+ # Clear memory after full PPO update
642
+ self.memory.clear()
643
+
644
+ return total_loss_epoch / (ppo_epochs * (num_samples / batch_size))
645
+
646
+ """
647
+ # Policy network (simple MLP, flattened observations)
648
+ class Policy(nn.Module):
649
+ def __init__(self, obs_dim: int, action_dim: int, hidden: int):
650
+ super().__init__()
651
+ self.net = nn.Sequential(
652
+ nn.Linear(obs_dim, hidden),
653
+ nn.ReLU(),
654
+ nn.Linear(hidden, hidden),
655
+ nn.ReLU(),
656
+ nn.Linear(hidden, action_dim)
657
+ )
658
+
659
+ def next_action(self, state: T.Tensor) -> Categorical:
660
+ # Returns the probability distribution over actions
661
+ if state.dim() == 1:
662
+ state = state.unsqueeze(0)
663
+ state = state.view(state.size(0), -1)
664
+ return Categorical(logits=self.net(state))
665
+ """
666
+
667
+ # Policy network (CNN)
668
+ class Policy(nn.Module):
669
+ def __init__(self, obs_shape: tuple, action_dim: int, hidden: int):
670
+ super().__init__()
671
+ c, h, w = obs_shape
672
+ # Suggested architecture for Atari: https://arxiv.org/pdf/1312.5602
673
+ self.cnn = nn.Sequential(
674
+ nn.Conv2d(c, 16, kernel_size=8, stride=4),
675
+ nn.ReLU(),
676
+ nn.Conv2d(16, 32, kernel_size=4, stride=2),
677
+ nn.ReLU(),
678
+ nn.Flatten()
679
+ )
680
+
681
+ with T.no_grad():
682
+ cnn_output_dim = self.cnn(T.zeros(1, c, h, w)).shape[1]
683
+
684
+ self.net = nn.Sequential(
685
+ nn.Linear(cnn_output_dim, hidden),
686
+ nn.ReLU(),
687
+ nn.Linear(hidden, action_dim)
688
+ )
689
+
690
+ def next_action(self, state: T.Tensor) -> Categorical:
691
+ # Returns the probability distribution over actions
692
+ if state.dim() == 3:
693
+ state = state.unsqueeze(0)
694
+ cnn_out = self.cnn(state)
695
+ return Categorical(logits=self.net(cnn_out))
696
+
697
+ """
698
+ # Critic network (simple MLP, flattened observations)
699
+ class Critic(nn.Module):
700
+ def __init__(self, obs_dim: int, hidden: int):
701
+ super().__init__()
702
+ self.net = nn.Sequential(
703
+ nn.Linear(obs_dim, hidden),
704
+ nn.ReLU(),
705
+ nn.Linear(hidden, hidden),
706
+ nn.ReLU(),
707
+ nn.Linear(hidden, 1)
708
+ )
709
+
710
+ def evaluated_state(self, x: T.Tensor) -> T.Tensor:
711
+ if x.dim() == 1:
712
+ x = x.unsqueeze(0)
713
+ x = x.view(x.size(0), -1)
714
+ return self.net(x).squeeze(-1)
715
+ """
716
+
717
+ # Critic network (CNN)
718
+ class Critic(nn.Module):
719
+ def __init__(self, obs_shape: tuple, hidden: int):
720
+ super().__init__()
721
+ c, h, w = obs_shape
722
+ # Suggested architecture for Atari: https://arxiv.org/pdf/1312.5602
723
+ self.cnn = nn.Sequential(
724
+ nn.Conv2d(c, 16, kernel_size=8, stride=4),
725
+ nn.ReLU(),
726
+ nn.Conv2d(16, 32, kernel_size=4, stride=2),
727
+ nn.ReLU(),
728
+ nn.Flatten()
729
+ )
730
+
731
+ with T.no_grad():
732
+ cnn_output_dim = self.cnn(T.zeros(1, c, h, w)).shape[1]
733
+
734
+ self.net = nn.Sequential(
735
+ nn.Linear(cnn_output_dim, hidden),
736
+ nn.ReLU(),
737
+ nn.Linear(hidden, 1)
738
+ )
739
+
740
+ def evaluated_state(self, x: T.Tensor) -> T.Tensor:
741
+ if x.dim() == 3:
742
+ x = x.unsqueeze(0)
743
+ cnn_out = self.cnn(x)
744
+ return self.net(cnn_out).squeeze(-1)
745
+
746
+ class Memory():
747
+ def __init__(self):
748
+ self.states = []
749
+ self.actions = []
750
+ self.rewards = []
751
+ self.dones = []
752
+ self.log_probs = []
753
+ self.values = []
754
+ self.next_values = []
755
+
756
+ def store(self, state, action, reward, done, log_prob, value, next_value):
757
+ self.states.append(np.asarray(state, dtype=np.float32))
758
+ self.actions.append(int(action))
759
+ self.rewards.append(float(reward))
760
+ self.dones.append(float(done))
761
+ self.log_probs.append(float(log_prob))
762
+ self.values.append(float(value))
763
+ self.next_values.append(float(next_value))
764
+
765
+ """
766
+ # For mini-batch updates? To be implemented
767
+ def start_batch(self, batch_size: int):
768
+ n_states = len(self.states)
769
+ starts = np.arange(0, n_states, batch_size)
770
+ index = np.arange(n_states, dtype=np.int64)
771
+ np.random.shuffle(index)
772
+ return [index[s:s + batch_size] for s in starts]
773
+ """
774
+
775
+ def clear(self):
776
+ self.states = []
777
+ self.actions = []
778
+ self.rewards = []
779
+ self.dones = []
780
+ self.log_probs = []
781
+ self.values = []
782
+ self.next_values = []
783
+
784
+
785
+
786
+ class ObservationNorm:
787
+ def __init__(self):
788
+ self.main_mean = 0
789
+ self.main_var = 0
790
+ self.count = 1e-4
791
+
792
+ def update(self, x: T.Tensor):
793
+ batch_mean = T.mean(x, dim=0)
794
+ batch_var = T.var(x, dim=0)
795
+ batch_count = x.shape[0]
796
+ self._update_from_moments(batch_mean, batch_var, batch_count)
797
+
798
+ def _update_from_moments(self, batch_mean, batch_var, batch_count):
799
+ delta = batch_mean - self.main_mean
800
+ tot_count = self.count + batch_count
801
+ new_mean = self.main_mean + delta * batch_count / tot_count #Update the running mean
802
+ m_a = self.main_var * self.count
803
+ m_b = batch_var * batch_count
804
+ M2 = m_a + m_b + delta ** 2 * self.count * batch_count / tot_count
805
+ new_var = M2 / tot_count # update the running variance
806
+
807
+ self.main_mean, self.main_var, self.count = new_mean, new_var, tot_count
808
+
809
+ def normalize(self, x):
810
+
811
+ return (x - self.main_mean) / (self.main_var ** 0.5 + 1e-8) # epsilon keeps us from
812
+ # dividing by zero; ** 0.5 also stays device-safe for tensors, unlike np.sqrt.
813
+
814
+
815
+
816
+
817
+
818
+ class AdvantageNorm:
819
+ '''
820
+ This class implements the Advantage Normalization. The purpose is to normalize either across batches or
821
+ only within the same batch.
822
+
823
+ '''
824
+ def __init__(self):
825
+ self.main_mean = 0
826
+ self.main_var = 0
827
+ self.count = 1e-4
828
+
829
+ def update(self, x: T.Tensor):
830
+ batch_mean = T.mean(x, dim=0)
831
+ batch_var = T.var(x, dim=0)
832
+ batch_count = x.shape[0]
833
+ self._update_from_moments(batch_mean, batch_var, batch_count)
834
+
835
+ def _update_from_moments(self, batch_mean, batch_var, batch_count):
836
+ delta = batch_mean - self.main_mean
837
+ tot_count = self.count + batch_count
838
+ new_mean = self.main_mean + delta * batch_count / tot_count #Update the running mean
839
+ m_a = self.main_var * self.count
840
+ m_b = batch_var * batch_count
841
+ M2 = m_a + m_b + delta ** 2 * self.count * batch_count / tot_count
842
+ new_var = M2 / tot_count # update the running variance
843
+
844
+ self.main_mean, self.main_var, self.count = new_mean, new_var, tot_count
845
+
846
+ def normalize(self, x):
847
+
848
+ return (x - self.main_mean) / (self.main_var ** 0.5 + 1e-8) # epsilon keeps us from
849
+ # dividing by zero; ** 0.5 also stays device-safe for tensors, unlike np.sqrt.
850
+
851
+
852
+
853
+
854
+ class ReturnNorm:
855
+ '''
856
+ This class implements the Return Normalization. The purpose is to normalize either across batches or
857
+ only within the same batch.
858
+
859
+ '''
860
+ def __init__(self):
861
+ self.main_mean = 0
862
+ self.main_var = 0
863
+ self.count = 1e-4
864
+
865
+ def update(self, x: T.Tensor):
866
+ batch_mean = T.mean(x, dim=0)
867
+ batch_var = T.var(x, dim=0)
868
+ batch_count = x.shape[0]
869
+ self._update_from_moments(batch_mean, batch_var, batch_count)
870
+
871
+ def _update_from_moments(self, batch_mean, batch_var, batch_count):
872
+ delta = batch_mean - self.main_mean
873
+ tot_count = self.count + batch_count
874
+ new_mean = self.main_mean + delta * batch_count / tot_count #Update the running mean
875
+ m_a = self.main_var * self.count
876
+ m_b = batch_var * batch_count
877
+ M2 = m_a + m_b + delta ** 2 * self.count * batch_count / tot_count
878
+ new_var = M2 / tot_count # update the running variance
879
+
880
+ self.main_mean, self.main_var, self.count = new_mean, new_var, tot_count
881
+
882
+ def normalize(self, x):
883
+
884
+ return (x - self.main_mean) / (self.main_var ** 0.5 + 1e-8) # epsilon keeps us from
885
+ # dividing by zero; ** 0.5 also stays device-safe for tensors, unlike np.sqrt.
886
+
887
+
888
+
889
+
890
+
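
A note on the three Norm classes above: they all share the same parallel-moments recursion (combining a running mean/variance with a batch mean/variance, as in Chan et al.'s parallel algorithm), seeded with a near-zero count of 1e-4 so the first batch dominates. Below is a minimal sanity-check sketch, assuming the file above is importable as ppo__rew_norm_obs_diff_hyp; the loose tolerances account for update() feeding the unbiased batch variance into a population-variance recursion.

    import torch as T
    from ppo__rew_norm_obs_diff_hyp import ObservationNorm

    norm = ObservationNorm()
    a = T.randn(100, 4)          # first rollout of flat observations
    b = T.randn(50, 4) + 2.0     # second rollout with a shifted mean
    norm.update(a)
    norm.update(b)

    both = T.cat([a, b])
    # The running moments should approximate the direct moments of all data seen so far.
    print(T.allclose(norm.main_mean, both.mean(dim=0), atol=1e-2))
    print(T.allclose(norm.main_var, both.var(dim=0, unbiased=False), atol=1e-1))

    # normalize() then standardizes with the running statistics:
    z = norm.normalize(both)
    print(z.mean(dim=0))  # close to 0
    print(z.std(dim=0))   # close to 1
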
Observation_Advantage_Norm/PPO_environment.py → Observation_Advantage_Norm_diff_hypo/ppo_rew_norm_obs_env_diff_hypo.py RENAMED
@@ -1,43 +1,75 @@
1
- import ale_py
2
  import gymnasium as gym
3
  import sys
4
- import numpy as np
5
- from PPO_Obser_Adva_Norm import *
6
  import matplotlib.pyplot as plt
7
 
8
 
9
  def preprocess(obs):
10
- # Flatten and normalize uint8 frames to float32 in [0,1]
11
- return (obs.astype(np.float32).ravel() / 255.0)
12
 
13
- def main() -> int:
14
- # Initialize environment
15
- env = gym.make("ALE/Pacman-v5", render_mode="human") # consider removing render_mode for training speed
16
- # Initialize variables
17
  episode = 0
18
  total_return = 0
19
  ep_return = 0
20
- steps = 1000 # Batch of 100, 1000 environment steps per update
21
- batches = 15
22
- mode = "clip"
23
- average_return = []
24
- total_loss = []
25
- updates = []
26
- activate_observation_norm = True
27
- activate_advantage_norm = False
28
-
29
- # Inspect spaces
30
  print("Observation space:", env.observation_space)
31
  print("Action space:", env.action_space)
32
 
33
- # Create PPO Agent (adapted to ppo_helpers_v2.Agent signature)
34
- agent = Agent(obs_space=env.observation_space, action_space=env.action_space, hidden=64,
35
- lr=3e-4, gamma=0.99, clip_coef=0.2, entropy_coef=0, value_coef=0.5, seed=70)
36
 
37
  try:
38
  obs, info = env.reset(seed=42)
39
  state = preprocess(obs)
40
 
41
  for update in range(1, batches + 1):
42
  for t in range(steps):
43
  action, logp, value = agent.choose_action(state)
@@ -58,12 +90,21 @@ def main() -> int:
58
  obs, info = env.reset()
59
  state = preprocess(obs)
60
 
61
- agent._update(mode, activate_observation_norm, activate_advantage_norm)
62
- avg_ret = (total_return / episode) if episode else 0
63
- average_return.append(avg_ret)
64
- total_loss.append(float(agent.total_loss.detach().cpu().item()))
65
- updates.append(update)
66
- print(f"Update {update}: episodes={episode}, avg_return={avg_ret:.2f}")
67
 
68
  except Exception as e:
69
  print(f"Error: {e}", file=sys.stderr)
@@ -72,22 +113,46 @@ def main() -> int:
72
  avg = total_return / episode if episode else 0
73
  print(f"\nEpisodes: {episode}, Avg return: {avg:.3f}")
74
  env.close()
75
- plt.plot(updates, average_return, label="average return")
76
- plt.plot(updates, total_loss, label="total loss")
77
- if activate_advantage_norm:
78
- plt.title("Average return vs. total loss with advantage norm")
79
- elif activate_observation_norm:
80
- plt.title("Average return vs. total loss with observation norm")
81
- elif activate_advantage_norm and activate_observation_norm:
82
- plt.title("Average return vs. total loss with observation norm and advantage norm")
83
- else:
84
- plt.title("Average return vs. total loss with no normalization")
85
-
86
- plt.xlabel("updates")
87
- plt.ylabel("average return/total loss")
88
- plt.legend()
89
- plt.show()
90
  return 0
91
 
 
92
  if __name__ == "__main__":
93
- raise SystemExit(main())
 
1
+
2
  import gymnasium as gym
3
  import sys
4
  import matplotlib.pyplot as plt
5
+ import ale_py
6
+ from ppo__rew_norm_obs_diff_hyp import *
7
+ from gymnasium.spaces import Box
8
+ import cv2
9
+
10
+
11
+
12
+
13
 
14
 
15
  def preprocess(obs):
16
+ # Convert to grayscale
17
+ obs = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY)
18
+ # Resize
19
+ obs = cv2.resize(obs, (84, 84), interpolation=cv2.INTER_AREA)
20
+ # Add channel dimension and normalize
21
+ return np.expand_dims(obs, axis=0).astype(np.float32) / 255.0
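+ # Output shape is (1, 84, 84): a single grayscale channel first, matching the CNN's (c, h, w) input.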
22
+
23
+
24
+ def rl_model(update_type, gamma=0.99, clip_coef=0.2,
25
+ lr = 1e-3, ent_coef = 0.01):
26
+ # env = gym.make("ALE/SpaceInvaders-v5", render_mode='human')
27
+ # env = gym.make("ALE/Pacman-v5", render_mode="human")
28
+ env = gym.make("ALE/Pacman-v5")
29
 
30
  episode = 0
31
  total_return = 0
32
  ep_return = 0
33
+ steps = 1000
34
+ batches = 100
35
+
36
  print("Observation space:", env.observation_space)
37
  print("Action space:", env.action_space)
38
+ """
39
+ agent = Agent(obs_space=env.observation_space, action_space=env.action_space,
40
+ hidden=64, lr=3e-4, gamma=0.99, clip_coef=0.2,
41
+ entropy_coef=0.01, value_coef=0.5, seed=70,
42
+ batch_size = 64, ppo_epochs = 4, lam = 0.95)
43
+
44
+ """
45
+ # Initialize CNN with a dummy observation (to get correct input shape)
46
+ obs, _ = env.reset()
47
+ dummy_obs_space = Box(low=0.0, high=1.0, shape=preprocess(obs).shape)
48
+ update_type = type
49
+ agent = Agent(obs_space=dummy_obs_space, action_space=env.action_space,
50
+ hidden=64, lr= lr, gamma= gamma, clip_coef= clip_coef,
51
+ entropy_coef= ent_coef, value_coef=0.5, seed=70,
52
+ batch_size=64, ppo_epochs=4, lam=0.95, update_type=update_type)
53
+ """
54
+ # Stats for Return-Based Scaling only
55
+ # === Return-Based Scaling stats ===
56
+ r_mean, r_var = 0.0, 1e-8
57
+ g2_mean = 1.0
58
 
59
+ agent.r_var = r_var
60
+ agent.g2_mean = g2_mean
61
+ """
62
 
63
  try:
64
  obs, info = env.reset(seed=42)
65
  state = preprocess(obs)
66
 
67
+ loss_history = []
68
+ reward_history = []
69
+
70
+ labels = []
71
+ final_scores = []
72
+
73
  for update in range(1, batches + 1):
74
  for t in range(steps):
75
  action, logp, value = agent.choose_action(state)
 
90
  obs, info = env.reset()
91
  state = preprocess(obs)
92
 
93
+ # Using reward gradient clipping
94
+ avg_loss = agent._update()
95
+
96
+ # Vanilla PPO (no normalization)
97
+ # avg_loss = agent.vanilla_ppo_update()
98
+ loss_history.append(avg_loss)
99
+
100
+ avg_ret = (total_return / episode) if episode else 0
101
+ reward_history.append(avg_ret)
102
+ print(f"Update {update}: episodes={episode}, avg_return={avg_ret:.2f}, avg_loss={avg_loss:.4f}")
103
+
104
+ return reward_history, loss_history
105
+
106
+
107
+
108
 
109
  except Exception as e:
110
  print(f"Error: {e}", file=sys.stderr)
 
113
  avg = total_return / episode if episode else 0
114
  print(f"\nEpisodes: {episode}, Avg return: {avg:.3f}")
115
  env.close()
116
+
117
+ return 0
118
+
119
+ def createHisto(x, final_scores, labels, title):
120
+ plt.figure(figsize=(10, 6)) # ← NEW FIGURE
121
+ plt.bar(x, final_scores)
122
+ plt.xticks(x, labels, rotation=45, ha="right")
123
+ plt.ylabel("Mean Reward")
124
+ plt.title(title)
125
+ plt.tight_layout()
126
+ plt.savefig(title + ".png")
127
+ plt.close()
128
+
129
+
130
+
131
+
132
+
133
+
134
+ def main() -> int:
135
+ type_list = ["update_observation_norm","update_advantage_norm",
136
+ "update_return_norm", "vanilla_ppo_update"]
137
+ learning_rates = [1e-2, 1e-3, 1e-4]
138
+ clip_coefs = [0.01, 0.1, 0.3 ]
139
+ gamma_list = [0.99, 0.97, 0.95]
140
+ entropy_coefs_list = [0.1, 0.01, 0.001]
141
+ final_scores = []
142
+ labels = ["entropy coef. = " + str(entrop_ceof) for entrop_ceof in entropy_coefs_list]
143
+ for update_type in type_list:
144
+ final_scores = []
145
+ for entropy_coef in entropy_coefs_list:
146
+ reward_history, loss_history = rl_model(update_type, ent_coef=entropy_coef)
147
+ final_scores.append(np.mean(reward_history))
148
+
149
+ createHisto(np.arange(len(labels)), final_scores, labels, "Performance config for entropy coefficient of " + update_type)
150
+
151
+
152
+
153
+
154
  return 0
155
 
156
+
157
  if __name__ == "__main__":
158
+ raise SystemExit(main())
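
main() above defines learning_rates, clip_coefs, and gamma_list but only sweeps the entropy coefficients; the learning-rate and gamma histograms committed alongside this file were presumably produced by analogous loops. A sketch of the learning-rate variant under that assumption, reusing rl_model and createHisto from above (np comes in via the star import; the title string mirrors the committed PNG names):

    def sweep_learning_rate(type_list, learning_rates):
        for update_type in type_list:
            labels = ["lr = " + str(lr) for lr in learning_rates]
            final_scores = []
            for lr in learning_rates:
                # One run per learning rate; all other hyperparameters keep their defaults.
                reward_history, _ = rl_model(update_type, lr=lr)
                final_scores.append(np.mean(reward_history))
            createHisto(np.arange(len(labels)), final_scores, labels,
                        "Performance config for Learning Rate of " + update_type)
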
Observation_Advantage_Norm_in_batch/ppo__rew_norm_obs_in_batch.py ADDED
@@ -0,0 +1,829 @@
1
+ import numpy as np
2
+ import torch as T
3
+ import torch.nn as nn
4
+ import torch.optim as optim
5
+ from torch.distributions import Categorical
6
+
7
+
8
+ class Agent:
9
+ def __init__(
10
+ self,
11
+ obs_space,
12
+ action_space,
13
+ hidden,
14
+ gamma,
15
+ clip_coef,
16
+ lr,
17
+ value_coef,
18
+ entropy_coef,
19
+ seed,
20
+ batch_size,
21
+ ppo_epochs,
22
+ lam,
23
+ update_type
24
+
25
+ ):
26
+ # Initialize seed for reproducibility
27
+ if seed is not None:
28
+ np.random.seed(seed)
29
+ T.manual_seed(seed)
30
+ """
31
+ # For flat observations (MLP model)
32
+ # Use GPU if available
33
+ self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
34
+ self.obs_dim = int(np.prod(getattr(obs_space, "shape", (obs_space,))))
35
+ self.action_dim = int(getattr(action_space, "n", action_space))
36
+
37
+ # Initialize the policy and the critic networks
38
+ self.policy = Policy(self.obs_dim, self.action_dim, hidden).to(self.device)
39
+ self.critic = Critic(self.obs_dim, hidden).to(self.device)
40
+ """
41
+ # Use GPU if available
42
+ self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
43
+ self.action_dim = int(getattr(action_space, "n", action_space))
44
+ self.update_type = update_type
45
+
46
+ # Initialize the policy and the critic networks
47
+ # Pass the shape tuple directly, not the flattened dimension.
48
+ self.policy = Policy(obs_space.shape, self.action_dim, hidden).to(self.device)
49
+ self.critic = Critic(obs_space.shape, hidden).to(self.device)
50
+ self.observeNorm = ObservationNorm()
51
+ self.advantageNorm = AdvantageNorm()
52
+ self.returnNorm = ReturnNorm()
53
+
54
+ # Set optimizer for policy and critic networks
55
+ self.opt = optim.Adam(
56
+ list(self.policy.parameters()) + list(self.critic.parameters()),
57
+ lr=lr
58
+ )
59
+
60
+ self.gamma = gamma
61
+ self.clip = clip_coef
62
+ self.value_coef = value_coef
63
+ self.entropy_coef = entropy_coef
64
+ self.sigma_history = []
65
+ self.loss_history = []
66
+ self.policy_loss_history = []
67
+ self.value_loss_history = []
68
+ self.entropy_history = []
69
+ self.lam = lam
70
+ self.ppo_epochs = ppo_epochs
71
+ self.batch_size = batch_size
72
+
73
+ self.memory = Memory()
74
+ """
75
+ # Choose action and remember for flat observations (MLP model)
76
+ def choose_action(self, observation):
77
+ # Returns: action, log probability, value of the state
78
+ state = T.as_tensor(observation, dtype=T.float32, device=self.device).view(-1)
79
+ with T.no_grad():
80
+ # Forward function (defined in Policy class)
81
+ dist = self.policy.next_action(state)
82
+ action = dist.sample()
83
+ logp = dist.log_prob(action)
84
+ value = self.critic.evaluated_state(state)
85
+ return int(action.item()), float(logp.item()), float(value.item())
86
+
87
+ def remember(self, state, action, reward, done, log_prob, value, next_state):
88
+ with T.no_grad():
89
+ # Pass on next state and have it evaluated by the critic network
90
+ ns = T.as_tensor(next_state, dtype=T.float32, device=self.device).view(-1)
91
+ next_value = self.critic.evaluated_state(ns).item()
92
+ self.memory.store(state, action, reward, done, log_prob, value, next_value)
93
+ """
94
+ # For CNN model
95
+ def choose_action(self, observation):
96
+ # Returns: action, log probability, value of the state
97
+ state = T.as_tensor(observation, dtype=T.float32, device=self.device) # Remove .view(-1)
98
+ with T.no_grad():
99
+ # Forward function (defined in Policy class)
100
+ dist = self.policy.next_action(state)
101
+ action = dist.sample()
102
+ logp = dist.log_prob(action)
103
+ value = self.critic.evaluated_state(state)
104
+ return int(action.item()), float(logp.item()), float(value.item())
105
+
106
+ def remember(self, state, action, reward, done, log_prob, value, next_state):
107
+ with T.no_grad():
108
+ # Pass on next state and have it evaluated by the critic network
109
+ ns = T.as_tensor(next_state, dtype=T.float32, device=self.device) # Remove .view(-1)
110
+ next_value = self.critic.evaluated_state(ns).item()
111
+ self.memory.store(state, action, reward, done, log_prob, value, next_value)
112
+
113
+
114
+ def _update(self):
115
+ if self.update_type == "update_observation_norm":
116
+ return self.update_observation_norm()
117
+ elif self.update_type == "update_advantage_norm":
118
+ return self.update_advantage_norm()
119
+ elif self.update_type == "update_return_norm":
120
+ return self.update_return_norm()
121
+ else:
122
+ return self.vanilla_ppo_update()
123
+
124
+ def vanilla_ppo_update(self):
125
+ if len(self.memory.states) == 0:
126
+ return 0.0
127
+
128
+ # Convert memory to tensors
129
+ states = T.as_tensor(np.array(self.memory.states), dtype=T.float32, device=self.device)
130
+ actions = T.as_tensor(self.memory.actions, dtype=T.long, device=self.device)
131
+ rewards = T.as_tensor(self.memory.rewards, dtype=T.float32, device=self.device)
132
+ dones = T.as_tensor(self.memory.dones, dtype=T.float32, device=self.device)
133
+ old_logp = T.as_tensor(self.memory.log_probs, dtype=T.float32, device=self.device)
134
+ values = T.as_tensor(self.memory.values, dtype=T.float32, device=self.device)
135
+
136
+ with T.no_grad():
137
+ # Compute next values (bootstrap for final step)
138
+ next_values = T.cat([values[1:], values[-1:].clone()])
139
+ deltas = rewards + self.gamma * next_values * (1 - dones) - values
140
+
141
+ # --- GAE-Lambda ---
142
+ adv = T.zeros_like(rewards)
143
+ gae = 0.0
144
+ for t in reversed(range(len(rewards))):
145
+ gae = deltas[t] + self.gamma * self.lam * (1 - dones[t]) * gae
146
+ adv[t] = gae
147
+
148
+ returns = adv + values
149
+ # Advantage normalization
150
+ adv = (adv - adv.mean()) / (adv.std(unbiased=False) + 1e-8)
151
+
152
+ # --- PPO Multiple Epochs + Minibatch ---
153
+ total_loss_epoch = 0.0
154
+ num_samples = len(states)
155
+ batch_size = min(64, num_samples)
156
+ ppo_epochs = 4
157
+
158
+ for _ in range(ppo_epochs):
159
+ # Shuffle indices
160
+ idxs = T.randperm(num_samples)
161
+ for start in range(0, num_samples, batch_size):
162
+ batch_idx = idxs[start:start + batch_size]
163
+
164
+ b_states = states[batch_idx]
165
+ b_actions = actions[batch_idx]
166
+ b_old_logp = old_logp[batch_idx]
167
+ b_returns = returns[batch_idx]
168
+ b_adv = adv[batch_idx]
169
+
170
+ dist = self.policy.next_action(b_states)
171
+ new_logp = dist.log_prob(b_actions)
172
+ entropy = dist.entropy().mean()
173
+ ratio = (new_logp - b_old_logp).exp()
174
+
175
+ # --- Clipped surrogate objective ---
176
+ surr1 = ratio * b_adv
177
+ surr2 = T.clamp(ratio, 1 - self.clip, 1 + self.clip) * b_adv
178
+ policy_loss = -T.min(surr1, surr2).mean()
179
+
180
+ # --- Critic loss ---
181
+ value_pred = self.critic.evaluated_state(b_states)
182
+ value_loss = 0.5 * (b_returns - value_pred).pow(2).mean()
183
+
184
+ # --- Total loss ---
185
+ total_loss = (
186
+ policy_loss +
187
+ self.value_coef * value_loss -
188
+ self.entropy_coef * entropy
189
+ )
190
+
191
+ # Debug: track individual loss components
192
+ self.policy_loss_history.append(policy_loss.item())
193
+ self.value_loss_history.append(value_loss.item())
194
+
195
+ self.opt.zero_grad(set_to_none=True)
196
+ total_loss.backward()
197
+ self.opt.step()
198
+
199
+ total_loss_epoch += total_loss.item()
200
+
201
+ # Clear memory after full PPO update
202
+ self.memory.clear()
203
+
204
+ return total_loss_epoch / (ppo_epochs * (num_samples / batch_size))
205
+
206
+
207
+ def update_rbs(self):
208
+ if len(self.memory.states) == 0:
209
+ return 0.0
210
+
211
+ # Convert memory to tensors
212
+ states = T.as_tensor(np.array(self.memory.states), dtype=T.float32, device=self.device)
213
+ actions = T.as_tensor(self.memory.actions, dtype=T.long, device=self.device)
214
+ rewards = T.as_tensor(self.memory.rewards, dtype=T.float32, device=self.device)
215
+ dones = T.as_tensor(self.memory.dones, dtype=T.float32, device=self.device)
216
+ old_logp = T.as_tensor(self.memory.log_probs, dtype=T.float32, device=self.device)
217
+ values = T.as_tensor(self.memory.values, dtype=T.float32, device=self.device)
218
+
219
+ with T.no_grad():
220
+ # Compute next values (bootstrap for final step)
221
+ next_values = T.cat([values[1:], values[-1:].clone()])
222
+ deltas = rewards + self.gamma * next_values * (1 - dones) - values
223
+
224
+ # --- GAE-Lambda ---
225
+ adv = T.zeros_like(rewards)
226
+ gae = 0.0
227
+ for t in reversed(range(len(rewards))):
228
+ gae = deltas[t] + self.gamma * self.lam * (1 - dones[t]) * gae
229
+ adv[t] = gae
230
+
231
+ returns = adv + values
232
+
233
+ # --- Return-based normalization (RBS) ---
234
+ sigma_t = returns.std(unbiased=False) + 1e-8
235
+ returns = returns / sigma_t
236
+ self.sigma_history.append(sigma_t.item())
237
+ adv = adv / sigma_t
238
+ # Advantage normalization
239
+ adv = (adv - adv.mean()) / (adv.std(unbiased=False) + 1e-8)
240
+
241
+ # --- PPO Multiple Epochs + Minibatch ---
242
+ total_loss_epoch = 0.0
243
+ num_samples = len(states)
244
+ batch_size = min(64, num_samples)
245
+ ppo_epochs = 4
246
+
247
+ for _ in range(ppo_epochs):
248
+ # Shuffle indices
249
+ idxs = T.randperm(num_samples)
250
+ for start in range(0, num_samples, batch_size):
251
+ batch_idx = idxs[start:start + batch_size]
252
+
253
+ b_states = states[batch_idx]
254
+ b_actions = actions[batch_idx]
255
+ b_old_logp = old_logp[batch_idx]
256
+ b_returns = returns[batch_idx]
257
+ b_adv = adv[batch_idx]
258
+
259
+ dist = self.policy.next_action(b_states)
260
+ new_logp = dist.log_prob(b_actions)
261
+ entropy = dist.entropy().mean()
262
+ ratio = (new_logp - b_old_logp).exp()
263
+
264
+ # --- Clipped surrogate objective ---
265
+ surr1 = ratio * b_adv
266
+ surr2 = T.clamp(ratio, 1 - self.clip, 1 + self.clip) * b_adv
267
+ policy_loss = -T.min(surr1, surr2).mean()
268
+
269
+ # --- Critic loss ---
270
+ value_pred = self.critic.evaluated_state(b_states)
271
+ value_loss = 0.5 * (b_returns - value_pred).pow(2).mean()
272
+
273
+ # --- Total loss ---
274
+ total_loss = (
275
+ policy_loss +
276
+ self.value_coef * value_loss -
277
+ self.entropy_coef * entropy
278
+ )
279
+
280
+ # Debug: track individual loss components
281
+ self.policy_loss_history.append(policy_loss.item())
282
+ self.value_loss_history.append(value_loss.item())
283
+
284
+ self.opt.zero_grad(set_to_none=True)
285
+ total_loss.backward()
286
+ self.opt.step()
287
+ total_loss_epoch += total_loss.item()
288
+
289
+ # Clear memory after full PPO update
290
+ self.memory.clear()
291
+
292
+ return total_loss_epoch / (ppo_epochs * (num_samples / batch_size))
293
+
294
+
295
+
296
+
297
+
298
+
299
+ def update_observation_norm(self):
300
+ if len(self.memory.states) == 0:
301
+ return 0.0
302
+
303
+ # Convert memory to tensors
304
+ states = T.as_tensor(np.array(self.memory.states), dtype=T.float32, device=self.device)
305
+ actions = T.as_tensor(self.memory.actions, dtype=T.long, device=self.device)
306
+ rewards = T.as_tensor(self.memory.rewards, dtype=T.float32, device=self.device)
307
+ dones = T.as_tensor(self.memory.dones, dtype=T.float32, device=self.device)
308
+ old_logp = T.as_tensor(self.memory.log_probs, dtype=T.float32, device=self.device)
309
+ values = T.as_tensor(self.memory.values, dtype=T.float32, device=self.device)
310
+
311
+ with T.no_grad():
312
+ # Compute next values (bootstrap for final step)
313
+ next_values = T.cat([values[1:], values[-1:].clone()])
314
+ deltas = rewards + self.gamma * next_values * (1 - dones) - values
315
+
316
+ # --- GAE-Lambda ---
317
+ adv = T.zeros_like(rewards)
318
+ gae = 0.0
319
+ for t in reversed(range(len(rewards))):
320
+ gae = deltas[t] + self.gamma * self.lam * (1 - dones[t]) * gae
321
+ adv[t] = gae
322
+
323
+ returns = adv + values
324
+
325
+ # --- observation normalization ---
326
+ states = self.observeNorm.normalize(states)
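+ # Note: unlike the running-average variant, update() is not called here, so the
+ # normalization depends on how the Norm classes are defined at the end of this file.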
327
+ # Advantage normalization
328
+ # adv = (adv - adv.mean()) / (adv.std(unbiased=False) + 1e-8)
329
+
330
+ # --- PPO Multiple Epochs + Minibatch ---
331
+ total_loss_epoch = 0.0
332
+ num_samples = len(states)
333
+ batch_size = min(64, num_samples)
334
+ ppo_epochs = 4
335
+
336
+ for _ in range(ppo_epochs):
337
+ # Shuffle indices
338
+ idxs = T.randperm(num_samples)
339
+ for start in range(0, num_samples, batch_size):
340
+ batch_idx = idxs[start:start + batch_size]
341
+
342
+ b_states = states[batch_idx]
343
+ b_actions = actions[batch_idx]
344
+ b_old_logp = old_logp[batch_idx]
345
+ b_returns = returns[batch_idx]
346
+ b_adv = adv[batch_idx]
347
+
348
+ dist = self.policy.next_action(b_states)
349
+ new_logp = dist.log_prob(b_actions)
350
+ entropy = dist.entropy().mean()
351
+ ratio = (new_logp - b_old_logp).exp()
352
+
353
+ # --- Clipped surrogate objective ---
354
+ surr1 = ratio * b_adv
355
+ surr2 = T.clamp(ratio, 1 - self.clip, 1 + self.clip) * b_adv
356
+ policy_loss = -T.min(surr1, surr2).mean()
357
+
358
+ # --- Critic loss ---
359
+ value_pred = self.critic.evaluated_state(b_states)
360
+ value_loss = 0.5 * (b_returns - value_pred).pow(2).mean()
361
+
362
+ # --- Total loss ---
363
+ total_loss = (
364
+ policy_loss +
365
+ self.value_coef * value_loss -
366
+ self.entropy_coef * entropy
367
+ )
368
+
369
+ # Debug: track individual loss components
370
+ self.policy_loss_history.append(policy_loss.item())
371
+ self.value_loss_history.append(value_loss.item())
372
+
373
+ self.opt.zero_grad(set_to_none=True)
374
+ total_loss.backward()
375
+ self.opt.step()
376
+ total_loss_epoch += total_loss.item()
377
+
378
+ # Clear memory after full PPO update
379
+ self.memory.clear()
380
+
381
+ return total_loss_epoch / (ppo_epochs * (num_samples / batch_size))
382
+
383
+
384
+
385
+
386
+ def update_advantage_norm(self):
387
+ if len(self.memory.states) == 0:
388
+ return 0.0
389
+
390
+ # Convert memory to tensors
391
+ states = T.as_tensor(np.array(self.memory.states), dtype=T.float32, device=self.device)
392
+ actions = T.as_tensor(self.memory.actions, dtype=T.long, device=self.device)
393
+ rewards = T.as_tensor(self.memory.rewards, dtype=T.float32, device=self.device)
394
+ dones = T.as_tensor(self.memory.dones, dtype=T.float32, device=self.device)
395
+ old_logp = T.as_tensor(self.memory.log_probs, dtype=T.float32, device=self.device)
396
+ values = T.as_tensor(self.memory.values, dtype=T.float32, device=self.device)
397
+
398
+ with T.no_grad():
399
+ # Compute next values (bootstrap for final step)
400
+ next_values = T.cat([values[1:], values[-1:].clone()])
401
+ deltas = rewards + self.gamma * next_values * (1 - dones) - values
402
+
403
+ # --- GAE-Lambda ---
404
+ adv = T.zeros_like(rewards)
405
+ gae = 0.0
406
+ for t in reversed(range(len(rewards))):
407
+ gae = deltas[t] + self.gamma * self.lam * (1 - dones[t]) * gae
408
+ adv[t] = gae
409
+
410
+ # --- Advantage normalization ---
411
+
412
+ returns = adv + values
413
+
414
+ adv = self.advantageNorm.normalize(adv)
415
+
416
+
417
+
418
+
419
+ # --- PPO Multiple Epochs + Minibatch ---
420
+ total_loss_epoch = 0.0
421
+ num_samples = len(states)
422
+ batch_size = min(64, num_samples)
423
+ ppo_epochs = 4
424
+
425
+ for _ in range(ppo_epochs):
426
+ # Shuffle indices
427
+ idxs = T.randperm(num_samples)
428
+ for start in range(0, num_samples, batch_size):
429
+ batch_idx = idxs[start:start + batch_size]
430
+
431
+ b_states = states[batch_idx]
432
+ b_actions = actions[batch_idx]
433
+ b_old_logp = old_logp[batch_idx]
434
+ b_returns = returns[batch_idx]
435
+ b_adv = adv[batch_idx]
436
+
437
+ dist = self.policy.next_action(b_states)
438
+ new_logp = dist.log_prob(b_actions)
439
+ entropy = dist.entropy().mean()
440
+ ratio = (new_logp - b_old_logp).exp()
441
+
442
+ # --- Clipped surrogate objective ---
443
+ surr1 = ratio * b_adv
444
+ surr2 = T.clamp(ratio, 1 - self.clip, 1 + self.clip) * b_adv
445
+ policy_loss = -T.min(surr1, surr2).mean()
446
+
447
+ # --- Critic loss ---
448
+ value_pred = self.critic.evaluated_state(b_states)
449
+ value_loss = 0.5 * (b_returns - value_pred).pow(2).mean()
450
+
451
+ # --- Total loss ---
452
+ total_loss = (
453
+ policy_loss +
454
+ self.value_coef * value_loss -
455
+ self.entropy_coef * entropy
456
+ )
457
+
458
+ # Debug: track individual loss components
459
+ self.policy_loss_history.append(policy_loss.item())
460
+ self.value_loss_history.append(value_loss.item())
461
+
462
+ self.opt.zero_grad(set_to_none=True)
463
+ total_loss.backward()
464
+ self.opt.step()
465
+ total_loss_epoch += total_loss.item()
466
+
467
+ # Clear memory after full PPO update
468
+ self.memory.clear()
469
+
470
+ return total_loss_epoch / (ppo_epochs * (num_samples / batch_size))
471
+
472
+ def update_return_norm(self):
473
+ if len(self.memory.states) == 0:
474
+ return 0.0
475
+
476
+ # Convert memory to tensors
477
+ states = T.as_tensor(np.array(self.memory.states), dtype=T.float32, device=self.device)
478
+ actions = T.as_tensor(self.memory.actions, dtype=T.long, device=self.device)
479
+ rewards = T.as_tensor(self.memory.rewards, dtype=T.float32, device=self.device)
480
+ dones = T.as_tensor(self.memory.dones, dtype=T.float32, device=self.device)
481
+ old_logp = T.as_tensor(self.memory.log_probs, dtype=T.float32, device=self.device)
482
+ values = T.as_tensor(self.memory.values, dtype=T.float32, device=self.device)
483
+
484
+ with T.no_grad():
485
+ # Compute next values (bootstrap for final step)
486
+ next_values = T.cat([values[1:], values[-1:].clone()])
487
+ deltas = rewards + self.gamma * next_values * (1 - dones) - values
488
+
489
+ # --- GAE-Lambda ---
490
+ adv = T.zeros_like(rewards)
491
+ gae = 0.0
492
+ for t in reversed(range(len(rewards))):
493
+ gae = deltas[t] + self.gamma * self.lam * (1 - dones[t]) * gae
494
+ adv[t] = gae
495
+
496
+
497
+
498
+ returns = adv + values
499
+
500
+ # --- returns normalization ---
501
+ returns = self.returnNorm.normalize(returns)
502
+
503
+
504
+ # Advantage normalization
505
+ #adv = (adv - adv.mean()) / (adv.std(unbiased=False) + 1e-8)
506
+
507
+ # --- PPO Multiple Epochs + Minibatch ---
508
+ total_loss_epoch = 0.0
509
+ num_samples = len(states)
510
+ batch_size = min(64, num_samples)
511
+ ppo_epochs = 4
512
+
513
+ for _ in range(ppo_epochs):
514
+ # Shuffle indices
515
+ idxs = T.randperm(num_samples)
516
+ for start in range(0, num_samples, batch_size):
517
+ batch_idx = idxs[start:start + batch_size]
518
+
519
+ b_states = states[batch_idx]
520
+ b_actions = actions[batch_idx]
521
+ b_old_logp = old_logp[batch_idx]
522
+ b_returns = returns[batch_idx]
523
+ b_adv = adv[batch_idx]
524
+
525
+ dist = self.policy.next_action(b_states)
526
+ new_logp = dist.log_prob(b_actions)
527
+ entropy = dist.entropy().mean()
528
+ ratio = (new_logp - b_old_logp).exp()
529
+
530
+ # --- Clipped surrogate objective ---
531
+ surr1 = ratio * b_adv
532
+ surr2 = T.clamp(ratio, 1 - self.clip, 1 + self.clip) * b_adv
533
+ policy_loss = -T.min(surr1, surr2).mean()
534
+
535
+ # --- Critic loss ---
536
+ value_pred = self.critic.evaluated_state(b_states)
537
+ value_loss = 0.5 * (b_returns - value_pred).pow(2).mean()
538
+
539
+ # --- Total loss ---
540
+ total_loss = (
541
+ policy_loss +
542
+ self.value_coef * value_loss -
543
+ self.entropy_coef * entropy
544
+ )
545
+
546
+ # Debug: track individual loss components
547
+ self.policy_loss_history.append(policy_loss.item())
548
+ self.value_loss_history.append(value_loss.item())
549
+
550
+ self.opt.zero_grad(set_to_none=True)
551
+ total_loss.backward()
552
+ self.opt.step()
553
+ total_loss_epoch += total_loss.item()
554
+
555
+ # Clear memory after full PPO update
556
+ self.memory.clear()
557
+
558
+ return total_loss_epoch / (ppo_epochs * (num_samples / batch_size))
559
+
560
+ def update_reward_gradient_clipping(self):
561
+ if len(self.memory.states) == 0:
562
+ return 0.0
563
+
564
+ # Convert memory to tensors
565
+ states = T.as_tensor(np.array(self.memory.states), dtype=T.float32, device=self.device)
566
+ actions = T.as_tensor(self.memory.actions, dtype=T.long, device=self.device)
567
+ rewards = T.as_tensor(self.memory.rewards, dtype=T.float32, device=self.device)
568
+ dones = T.as_tensor(self.memory.dones, dtype=T.float32, device=self.device)
569
+ old_logp = T.as_tensor(self.memory.log_probs, dtype=T.float32, device=self.device)
570
+ values = T.as_tensor(self.memory.values, dtype=T.float32, device=self.device)
571
+
572
+ # Reward clipping
573
+ rewards = T.clamp(rewards, -1, 1)
574
+
575
+ with T.no_grad():
576
+ # Compute next values (bootstrap for final step)
577
+ next_values = T.cat([values[1:], values[-1:].clone()])
578
+ deltas = rewards + self.gamma * next_values * (1 - dones) - values
579
+
580
+ # --- GAE-Lambda ---
581
+ adv = T.zeros_like(rewards)
582
+ gae = 0.0
583
+ for t in reversed(range(len(rewards))):
584
+ gae = deltas[t] + self.gamma * self.lam * (1 - dones[t]) * gae
585
+ adv[t] = gae
586
+
587
+ returns = adv + values
588
+ # Advantage normalization
589
+ adv = (adv - adv.mean()) / (adv.std(unbiased=False) + 1e-8)
590
+
591
+ # --- PPO Multiple Epochs + Minibatch ---
592
+ total_loss_epoch = 0.0
593
+ num_samples = len(states)
594
+ batch_size = min(64, num_samples)
595
+ ppo_epochs = 4
596
+
597
+ for _ in range(ppo_epochs):
598
+ # Shuffle indices
599
+ idxs = T.randperm(num_samples)
600
+ for start in range(0, num_samples, batch_size):
601
+ batch_idx = idxs[start:start + batch_size]
602
+
603
+ b_states = states[batch_idx]
604
+ b_actions = actions[batch_idx]
605
+ b_old_logp = old_logp[batch_idx]
606
+ b_returns = returns[batch_idx]
607
+ b_adv = adv[batch_idx]
608
+
609
+ dist = self.policy.next_action(b_states)
610
+ new_logp = dist.log_prob(b_actions)
611
+ entropy = dist.entropy().mean()
612
+ ratio = (new_logp - b_old_logp).exp()
613
+
614
+ # --- Clipped surrogate objective ---
615
+ surr1 = ratio * b_adv
616
+ surr2 = T.clamp(ratio, 1 - self.clip, 1 + self.clip) * b_adv
617
+ policy_loss = -T.min(surr1, surr2).mean()
618
+
619
+ # --- Critic loss ---
620
+ value_pred = self.critic.evaluated_state(b_states)
621
+ value_loss = 0.5 * (b_returns - value_pred).pow(2).mean()
622
+
623
+ # --- Total loss ---
624
+ total_loss = (
625
+ policy_loss +
626
+ self.value_coef * value_loss -
627
+ self.entropy_coef * entropy
628
+ )
629
+
630
+ # Debug: track individual loss components
631
+ self.policy_loss_history.append(policy_loss.item())
632
+ self.value_loss_history.append(value_loss.item())
633
+
634
+ self.opt.zero_grad(set_to_none=True)
635
+ total_loss.backward()
636
+ T.nn.utils.clip_grad_norm_(list(self.policy.parameters()) + list(self.critic.parameters()), 0.5)
637
+ self.opt.step()
638
+
639
+ total_loss_epoch += total_loss.item()
640
+
641
+ # Clear memory after full PPO update
642
+ self.memory.clear()
643
+
644
+ return total_loss_epoch / (ppo_epochs * (num_samples / batch_size))
645
+
646
+ """
647
+ # Policy network (simple MLP, flattened observations)
648
+ class Policy(nn.Module):
649
+ def __init__(self, obs_dim: int, action_dim: int, hidden: int):
650
+ super().__init__()
651
+ self.net = nn.Sequential(
652
+ nn.Linear(obs_dim, hidden),
653
+ nn.ReLU(),
654
+ nn.Linear(hidden, hidden),
655
+ nn.ReLU(),
656
+ nn.Linear(hidden, action_dim)
657
+ )
658
+
659
+ def next_action(self, state: T.Tensor) -> Categorical:
660
+ # Returns the probability distribution over actions
661
+ if state.dim() == 1:
662
+ state = state.unsqueeze(0)
663
+ state = state.view(state.size(0), -1)
664
+ return Categorical(logits=self.net(state))
665
+ """
666
+
667
+ # Policy network (CNN)
668
+ class Policy(nn.Module):
669
+ def __init__(self, obs_shape: tuple, action_dim: int, hidden: int):
670
+ super().__init__()
671
+ c, h, w = obs_shape
672
+ # Suggested architecture for Atari: https://arxiv.org/pdf/1312.5602
673
+ self.cnn = nn.Sequential(
674
+ nn.Conv2d(c, 16, kernel_size=8, stride=4),
675
+ nn.ReLU(),
676
+ nn.Conv2d(16, 32, kernel_size=4, stride=2),
677
+ nn.ReLU(),
678
+ nn.Flatten()
679
+ )
680
+
681
+ with T.no_grad():
682
+ cnn_output_dim = self.cnn(T.zeros(1, c, h, w)).shape[1]
683
+
684
+ self.net = nn.Sequential(
685
+ nn.Linear(cnn_output_dim, hidden),
686
+ nn.ReLU(),
687
+ nn.Linear(hidden, action_dim)
688
+ )
689
+
690
+ def next_action(self, state: T.Tensor) -> Categorical:
691
+ # Returns the probability distribution over actions
692
+ if state.dim() == 3:
693
+ state = state.unsqueeze(0)
694
+ cnn_out = self.cnn(state)
695
+ return Categorical(logits=self.net(cnn_out))
696
+
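A quick sanity check on the architecture above (an aside, not part of the committed files): for the 1x84x84 frames produced by preprocess in the companion environment script, each conv layer shrinks the spatial size by floor((in - kernel) / stride) + 1, so cnn_output_dim should come out to 2592.

import torch as T
import torch.nn as nn

# Same two conv layers as Policy.cnn, checked on a dummy 1x84x84 frame.
cnn = nn.Sequential(
    nn.Conv2d(1, 16, kernel_size=8, stride=4),   # 84 -> (84 - 8) // 4 + 1 = 20
    nn.ReLU(),
    nn.Conv2d(16, 32, kernel_size=4, stride=2),  # 20 -> (20 - 4) // 2 + 1 = 9
    nn.ReLU(),
    nn.Flatten(),
)
print(cnn(T.zeros(1, 1, 84, 84)).shape)  # torch.Size([1, 2592]) = 32 * 9 * 9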
697
+ """
698
+ # Critic network (simple MLP, flattened observations)
699
+ class Critic(nn.Module):
700
+ def __init__(self, obs_dim: int, hidden: int):
701
+ super().__init__()
702
+ self.net = nn.Sequential(
703
+ nn.Linear(obs_dim, hidden),
704
+ nn.ReLU(),
705
+ nn.Linear(hidden, hidden),
706
+ nn.ReLU(),
707
+ nn.Linear(hidden, 1)
708
+ )
709
+
710
+ def evaluated_state(self, x: T.Tensor) -> T.Tensor:
711
+ if x.dim() == 1:
712
+ x = x.unsqueeze(0)
713
+ x = x.view(x.size(0), -1)
714
+ return self.net(x).squeeze(-1)
715
+ """
716
+
717
+ # Critic network (CNN)
718
+ class Critic(nn.Module):
719
+ def __init__(self, obs_shape: tuple, hidden: int):
720
+ super().__init__()
721
+ c, h, w = obs_shape
722
+ # Suggested architecture for Atari: https://arxiv.org/pdf/1312.5602
723
+ self.cnn = nn.Sequential(
724
+ nn.Conv2d(c, 16, kernel_size=8, stride=4),
725
+ nn.ReLU(),
726
+ nn.Conv2d(16, 32, kernel_size=4, stride=2),
727
+ nn.ReLU(),
728
+ nn.Flatten()
729
+ )
730
+
731
+ with T.no_grad():
732
+ cnn_output_dim = self.cnn(T.zeros(1, c, h, w)).shape[1]
733
+
734
+ self.net = nn.Sequential(
735
+ nn.Linear(cnn_output_dim, hidden),
736
+ nn.ReLU(),
737
+ nn.Linear(hidden, 1)
738
+ )
739
+
740
+ def evaluated_state(self, x: T.Tensor) -> T.Tensor:
741
+ if x.dim() == 3:
742
+ x = x.unsqueeze(0)
743
+ cnn_out = self.cnn(x)
744
+ return self.net(cnn_out).squeeze(-1)
745
+
746
+ class Memory():
747
+ def __init__(self):
748
+ self.states = []
749
+ self.actions = []
750
+ self.rewards = []
751
+ self.dones = []
752
+ self.log_probs = []
753
+ self.values = []
754
+ self.next_values = []
755
+
756
+ def store(self, state, action, reward, done, log_prob, value, next_value):
757
+ self.states.append(np.asarray(state, dtype=np.float32))
758
+ self.actions.append(int(action))
759
+ self.rewards.append(float(reward))
760
+ self.dones.append(float(done))
761
+ self.log_probs.append(float(log_prob))
762
+ self.values.append(float(value))
763
+ self.next_values.append(float(next_value))
764
+
765
+ """
766
+ # For mini-batch updates? To be implemented
767
+ def start_batch(self, batch_size: int):
768
+ n_states = len(self.states)
769
+ starts = np.arange(0, n_states, batch_size)
770
+ index = np.arange(n_states, dtype=np.int64)
771
+ np.random.shuffle(index)
772
+ return [index[s:s + batch_size] for s in starts]
773
+ """
774
+
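If the commented-out start_batch above were wired in, the update methods could iterate over its index slices instead of slicing T.randperm themselves. A minimal usage sketch, assuming a filled Memory instance named memory and the tensors (states, actions, ...) built as in the update methods:

# Hypothetical wiring of Memory.start_batch (sketch only; the code currently uses T.randperm).
for _ in range(ppo_epochs):
    for batch_idx in memory.start_batch(batch_size=64):
        b_states = states[batch_idx]    # torch tensors accept NumPy int64 index arrays
        b_actions = actions[batch_idx]
        # ... then the same clipped-surrogate / value / entropy losses as above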
775
+ def clear(self):
776
+ self.states = []
777
+ self.actions = []
778
+ self.rewards = []
779
+ self.dones = []
780
+ self.log_probs = []
781
+ self.values = []
782
+ self.next_values = []
783
+
784
+
785
+
786
+ class ObservationNorm:
787
+ # Stateless per-batch observation normalization: z-scores each batch with its own mean and std.
788
+
789
+ def normalize(self, x):
790
+ return (x - x.mean()) / (x.std(unbiased=False) + 1e-8)  # Epsilon guards against division by zero.
791
+
792
+
793
+
794
+
795
+
796
+
797
+ class AdvantageNorm:
798
+ '''
799
+ This class implements advantage normalization. In this file's variant, advantages are
800
+ z-scored using only the statistics of the current batch (no running averages).
801
+
802
+ '''
803
+
804
+
805
+ def normalize(self, x):
806
+
807
+ return (x - x.mean()) / (x.std(unbiased=False) + 1e-8)  # Epsilon guards against division by zero.
808
+
809
+
810
+
811
+
812
+
813
+ class ReturnNorm:
814
+ '''
815
+ This class implements return normalization. In this file's variant, returns are
816
+ z-scored using only the statistics of the current batch (no running averages).
817
+
818
+ '''
819
+
820
+
821
+ def normalize(self, x):
822
+ return (x - x.mean()) / (x.std(unbiased=False) + 1e-8)
823
+ # Epsilon guards against division by zero.
824
+
825
+
826
+
827
+
828
+
829
+
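All three helper classes in this file apply the same stateless in-batch z-score. A tiny demonstration of the transform (a sketch, independent of the training code):

import torch as T

x = T.tensor([1.0, 2.0, 3.0, 4.0])
z = (x - x.mean()) / (x.std(unbiased=False) + 1e-8)
print(round(z.mean().item(), 6), round(z.std(unbiased=False).item(), 6))  # ~0.0 and ~1.0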
Observation_Advantage_Norm_in_batch/ppo_rew_norm_obs_env_in_batch.py ADDED
@@ -0,0 +1,163 @@
1
+
2
+ import gymnasium as gym
3
+ import sys
4
+ import matplotlib.pyplot as plt
5
+ import ale_py
6
+ from ppo__rew_norm_obs_in_batch import *
7
+ from gymnasium.spaces import Box
8
+ import cv2
9
+
10
+ def preprocess(obs):
11
+ # Convert to grayscale
12
+ obs = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY)
13
+ # Resize
14
+ obs = cv2.resize(obs, (84, 84), interpolation=cv2.INTER_AREA)
15
+ # Add channel dimension and normalize
16
+ return np.expand_dims(obs, axis=0).astype(np.float32) / 255.0
17
+
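For reference, preprocess turns a raw RGB ALE frame into a single-channel 84x84 float32 array scaled to [0, 1]. A quick shape check, using a dummy all-zero frame in place of a real observation (sketch only):

import numpy as np

frame = np.zeros((210, 160, 3), dtype=np.uint8)  # stand-in for an ALE RGB observation
out = preprocess(frame)
print(out.shape, out.dtype)  # (1, 84, 84) float32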
18
+
19
+ def rl_model(type):
20
+ # env = gym.make("ALE/SpaceInvaders-v5", render_mode='human')
21
+ # env = gym.make("ALE/Pacman-v5", render_mode="human")
22
+ env = gym.make("ALE/Pacman-v5")
23
+
24
+ episode = 0
25
+ total_return = 0
26
+ ep_return = 0
27
+ steps = 1000
28
+ batches = 100
29
+
30
+ print("Observation space:", env.observation_space)
31
+ print("Action space:", env.action_space)
32
+ """
33
+ agent = Agent(obs_space=env.observation_space, action_space=env.action_space,
34
+ hidden=64, lr=3e-4, gamma=0.99, clip_coef=0.2,
35
+ entropy_coef=0.01, value_coef=0.5, seed=70,
36
+ batch_size = 64, ppo_epochs = 4, lam = 0.95)
37
+
38
+ """
39
+ # Initialize CNN with a dummy observation (to get correct input shape)
40
+ obs, _ = env.reset()
41
+ dummy_obs_space = Box(low=0.0, high=1.0, shape=preprocess(obs).shape)
42
+ update_type = type
43
+ agent = Agent(obs_space=dummy_obs_space, action_space=env.action_space,
44
+ hidden=64, lr=3e-4, gamma=0.99, clip_coef=0.2,
45
+ entropy_coef=0.01, value_coef=0.5, seed=70,
46
+ batch_size=64, ppo_epochs=4, lam=0.95, update_type=update_type)
47
+ """
48
+ # Stats for Return-Based Scaling only
49
+ # === Return-Based Scaling stats ===
50
+ r_mean, r_var = 0.0, 1e-8
51
+ g2_mean = 1.0
52
+
53
+ agent.r_var = r_var
54
+ agent.g2_mean = g2_mean
55
+ """
56
+
57
+ try:
58
+ obs, info = env.reset(seed=42)
59
+ state = preprocess(obs)
60
+
61
+ loss_history = []
62
+ reward_history = []
63
+
64
+ for update in range(1, batches + 1):
65
+ for t in range(steps):
66
+ action, logp, value = agent.choose_action(state)
67
+ next_obs, reward, terminated, truncated, info = env.step(action)
68
+ done = terminated or truncated
69
+ next_state = preprocess(next_obs)
70
+
71
+ agent.remember(state, action, reward, done, logp, value, next_state)
72
+
73
+ ep_return += reward
74
+ state = next_state
75
+
76
+ if done:
77
+ episode += 1
78
+ total_return += ep_return
79
+ print(f"Episode {episode} return: {ep_return:.2f}")
80
+ ep_return = 0
81
+ obs, info = env.reset()
82
+ state = preprocess(obs)
83
+
84
+ # Run the PPO update variant selected by update_type
85
+ avg_loss = agent._update()
86
+
87
+ # Vanilla PPO (no normalization)
88
+ # avg_loss = agent.vanilla_ppo_update()
89
+ loss_history.append(avg_loss)
90
+
91
+ avg_ret = (total_return / episode) if episode else 0
92
+ reward_history.append(avg_ret)
93
+ print(f"Update {update}: episodes={episode}, avg_return={avg_ret:.2f}, avg_loss={avg_loss:.4f}")
94
+
95
+ fig = plt.figure(figsize=(12, 8))
96
+
97
+ """
98
+ # Plot for Return-Based Scaling only
99
+ ax1 = plt.subplot(220)
100
+ ax1.plot(agent.sigma_history, label="Return σ")
101
+ ax1.set_xlabel("PPO Update")
102
+ ax1.set_ylabel("σ (Return Std)")
103
+ """
104
+
105
+ ax2 = plt.subplot(221)
106
+ ax2.plot(loss_history, label="Avg Loss")
107
+ ax2.set_ylabel("Average PPO Loss")
108
+ ax2.set_xlabel("PPO Update")
109
+
110
+ ax3 = plt.subplot(222)
111
+ ax3.plot(reward_history, label="Reward")
112
+ ax3.set_ylabel("Reward")
113
+ ax3.set_xlabel("PPO Update")
114
+
115
+ # Details about value loss and policy loss
116
+ ax4 = plt.subplot(223)
117
+ ax4.plot(agent.policy_loss_history, label="Policy Loss", alpha=0.7)
118
+ ax4.set_ylabel("Policy Loss")
119
+ ax4.set_xlabel("Training Step")
120
+ ax4.legend()
121
+
122
+ ax5 = plt.subplot(224)
123
+ ax5.plot(agent.value_loss_history, label="Value Loss", alpha=0.7)
124
+ ax5.set_ylabel("Value Loss")
125
+ ax5.set_xlabel("Training Step")
126
+ ax5.legend()
127
+
128
+ fig.suptitle("PPO Training Stability of type " + update_type +
129
+ "-in_batch")
130
+ fig.tight_layout()
131
+ plt.savefig(type +"_in_batch.png")
132
+
133
+
134
+
135
+
136
+ except Exception as e:
137
+ print(f"Error: {e}", file=sys.stderr)
138
+ return 1
139
+ finally:
140
+ avg = total_return / episode if episode else 0
141
+ print(f"\nEpisodes: {episode}, Avg return: {avg:.3f}")
142
+ env.close()
143
+
144
+ return 0
145
+
146
+
147
+
148
+
149
+
150
+
151
+
152
+
153
+ def main() -> int:
154
+ type_list = ["update_observation_norm", "update_advantage_norm", "update_return_norm", "vanilla_ppo_update"]
155
+
156
+ for type in type_list:
157
+ rl_model(type)
158
+
159
+ return 0
160
+
161
+
162
+ if __name__ == "__main__":
163
+ raise SystemExit(main())
Observation_Advantage_Norm_in_batch/update_advantage_norm_in_batch.png ADDED
Observation_Advantage_Norm_in_batch/update_observation_norm_in_batch.png ADDED
Observation_Advantage_Norm_in_batch/update_return_norm_in_batch.png ADDED
Observation_Advantage_Norm_in_batch/vanilla_ppo_update_in_batch.png ADDED
Observation_Advantage_Norm_running_averages/ppo__rew_norm_obs_running_average.py ADDED
@@ -0,0 +1,893 @@
1
+ import numpy as np
2
+ import torch as T
3
+ import torch.nn as nn
4
+ import torch.optim as optim
5
+ from torch.distributions import Categorical
6
+
7
+
8
+ class Agent:
9
+ def __init__(
10
+ self,
11
+ obs_space,
12
+ action_space,
13
+ hidden,
14
+ gamma,
15
+ clip_coef,
16
+ lr,
17
+ value_coef,
18
+ entropy_coef,
19
+ seed,
20
+ batch_size,
21
+ ppo_epochs,
22
+ lam,
23
+ update_type
24
+
25
+ ):
26
+ # Initialize seed for reproducibility
27
+ if seed is not None:
28
+ np.random.seed(seed)
29
+ T.manual_seed(seed)
30
+ """
31
+ # For flat observations (MLP model)
32
+ # Use GPU if available
33
+ self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
34
+ self.obs_dim = int(np.prod(getattr(obs_space, "shape", (obs_space,))))
35
+ self.action_dim = int(getattr(action_space, "n", action_space))
36
+
37
+ # Initialize the policy and the critic networks
38
+ self.policy = Policy(self.obs_dim, self.action_dim, hidden).to(self.device)
39
+ self.critic = Critic(self.obs_dim, hidden).to(self.device)
40
+ """
41
+ # Use GPU if available
42
+ self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
43
+ self.action_dim = int(getattr(action_space, "n", action_space))
44
+ self.update_type = update_type
45
+
46
+ # Initialize the policy and the critic networks
47
+ # Pass the shape tuple directly, not the flattened dimension.
48
+ self.policy = Policy(obs_space.shape, self.action_dim, hidden).to(self.device)
49
+ self.critic = Critic(obs_space.shape, hidden).to(self.device)
50
+ self.observeNorm = ObservationNorm()
51
+ self.advantageNorm = AdvantageNorm()
52
+ self.returnNorm = ReturnNorm()
53
+
54
+ # Set optimizer for policy and critic networks
55
+ self.opt = optim.Adam(
56
+ list(self.policy.parameters()) + list(self.critic.parameters()),
57
+ lr=lr
58
+ )
59
+
60
+ self.gamma = gamma
61
+ self.clip = clip_coef
62
+ self.value_coef = value_coef
63
+ self.entropy_coef = entropy_coef
64
+ self.sigma_history = []
65
+ self.loss_history = []
66
+ self.policy_loss_history = []
67
+ self.value_loss_history = []
68
+ self.entropy_history = []
69
+ self.lam = lam
70
+ self.ppo_epochs = ppo_epochs
71
+ self.batch_size = batch_size
72
+
73
+ self.memory = Memory()
74
+ """
75
+ # Choose action and remember for flat observations (MLP model)
76
+ def choose_action(self, observation):
77
+ # Returns: action, log probabilitiy, value of the state
78
+ state = T.as_tensor(observation, dtype=T.float32, device=self.device).view(-1)
79
+ with T.no_grad():
80
+ # Forward function (defined in Policy class)
81
+ dist = self.policy.next_action(state)
82
+ action = dist.sample()
83
+ logp = dist.log_prob(action)
84
+ value = self.critic.evaluated_state(state)
85
+ return int(action.item()), float(logp.item()), float(value.item())
86
+
87
+ def remember(self, state, action, reward, done, log_prob, value, next_state):
88
+ with T.no_grad():
89
+ # Pass on next state and have it evaluated by the critic network
90
+ ns = T.as_tensor(next_state, dtype=T.float32, device=self.device).view(-1)
91
+ next_value = self.critic.evaluated_state(ns).item()
92
+ self.memory.store(state, action, reward, done, log_prob, value, next_value)
93
+ """
94
+ # For CNN model
95
+ def choose_action(self, observation):
96
+ # Returns: action, log probability, value of the state
97
+ state = T.as_tensor(observation, dtype=T.float32, device=self.device) # Remove .view(-1)
98
+ with T.no_grad():
99
+ # Forward function (defined in Policy class)
100
+ dist = self.policy.next_action(state)
101
+ action = dist.sample()
102
+ logp = dist.log_prob(action)
103
+ value = self.critic.evaluated_state(state)
104
+ return int(action.item()), float(logp.item()), float(value.item())
105
+
106
+ def remember(self, state, action, reward, done, log_prob, value, next_state):
107
+ with T.no_grad():
108
+ # Pass on next state and have it evaluated by the critic network
109
+ ns = T.as_tensor(next_state, dtype=T.float32, device=self.device) # Remove .view(-1)
110
+ next_value = self.critic.evaluated_state(ns).item()
111
+ self.memory.store(state, action, reward, done, log_prob, value, next_value)
112
+
113
+
114
+ def _update(self):
115
+ if self.update_type == "update_observation_norm":
116
+ return self.update_observation_norm()
117
+ elif self.update_type == "update_advantage_norm":
118
+ return self.update_advantage_norm()
119
+ elif self.update_type == "update_return_norm":
120
+ return self.update_return_norm()
121
+ else:
122
+ return self.vanilla_ppo_update()
123
+
124
+ def vanilla_ppo_update(self):
125
+ if len(self.memory.states) == 0:
126
+ return 0.0
127
+
128
+ # Convert memory to tensors
129
+ states = T.as_tensor(np.array(self.memory.states), dtype=T.float32, device=self.device)
130
+ actions = T.as_tensor(self.memory.actions, dtype=T.long, device=self.device)
131
+ rewards = T.as_tensor(self.memory.rewards, dtype=T.float32, device=self.device)
132
+ dones = T.as_tensor(self.memory.dones, dtype=T.float32, device=self.device)
133
+ old_logp = T.as_tensor(self.memory.log_probs, dtype=T.float32, device=self.device)
134
+ values = T.as_tensor(self.memory.values, dtype=T.float32, device=self.device)
135
+
136
+ with T.no_grad():
137
+ # Compute next values (bootstrap for final step)
138
+ next_values = T.cat([values[1:], values[-1:].clone()])
139
+ deltas = rewards + self.gamma * next_values * (1 - dones) - values
140
+
141
+ # --- GAE-Lambda ---
142
+ adv = T.zeros_like(rewards)
143
+ gae = 0.0
144
+ for t in reversed(range(len(rewards))):
145
+ gae = deltas[t] + self.gamma * self.lam * (1 - dones[t]) * gae
146
+ adv[t] = gae
147
+
148
+ returns = adv + values
149
+ # Advantage normalization
150
+ adv = (adv - adv.mean()) / (adv.std(unbiased=False) + 1e-8)
151
+
152
+ # --- PPO Multiple Epochs + Minibatch ---
153
+ total_loss_epoch = 0.0
154
+ num_samples = len(states)
155
+ batch_size = min(64, num_samples)
156
+ ppo_epochs = 4
157
+
158
+ for _ in range(ppo_epochs):
159
+ # Shuffle indices
160
+ idxs = T.randperm(num_samples)
161
+ for start in range(0, num_samples, batch_size):
162
+ batch_idx = idxs[start:start + batch_size]
163
+
164
+ b_states = states[batch_idx]
165
+ b_actions = actions[batch_idx]
166
+ b_old_logp = old_logp[batch_idx]
167
+ b_returns = returns[batch_idx]
168
+ b_adv = adv[batch_idx]
169
+
170
+ dist = self.policy.next_action(b_states)
171
+ new_logp = dist.log_prob(b_actions)
172
+ entropy = dist.entropy().mean()
173
+ ratio = (new_logp - b_old_logp).exp()
174
+
175
+ # --- Clipped surrogate objective ---
176
+ surr1 = ratio * b_adv
177
+ surr2 = T.clamp(ratio, 1 - self.clip, 1 + self.clip) * b_adv
178
+ policy_loss = -T.min(surr1, surr2).mean()
179
+
180
+ # --- Critic loss ---
181
+ value_pred = self.critic.evaluated_state(b_states)
182
+ value_loss = 0.5 * (b_returns - value_pred).pow(2).mean()
183
+
184
+ # --- Total loss ---
185
+ total_loss = (
186
+ policy_loss +
187
+ self.value_coef * value_loss -
188
+ self.entropy_coef * entropy
189
+ )
190
+
191
+ # Debug: track individual loss components
192
+ self.policy_loss_history.append(policy_loss.item())
193
+ self.value_loss_history.append(value_loss.item())
194
+
195
+ self.opt.zero_grad(set_to_none=True)
196
+ total_loss.backward()
197
+ self.opt.step()
198
+
199
+ total_loss_epoch += total_loss.item()
200
+
201
+ # Clear memory after full PPO update
202
+ self.memory.clear()
203
+
204
+ return total_loss_epoch / (ppo_epochs * (num_samples / batch_size))
205
+
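A note on the GAE block shared by every update variant in this file: with the bootstrapped next value V(s_{t+1}), the TD error is \delta_t = r_t + \gamma (1 - d_t) V(s_{t+1}) - V(s_t), and the reversed loop implements the recursion \hat{A}_t = \delta_t + \gamma \lambda (1 - d_t) \hat{A}_{t+1}; returns are then recovered as R_t = \hat{A}_t + V(s_t).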
206
+
207
+ def update_rbs(self):
208
+ if len(self.memory.states) == 0:
209
+ return 0.0
210
+
211
+ # Convert memory to tensors
212
+ states = T.as_tensor(np.array(self.memory.states), dtype=T.float32, device=self.device)
213
+ actions = T.as_tensor(self.memory.actions, dtype=T.long, device=self.device)
214
+ rewards = T.as_tensor(self.memory.rewards, dtype=T.float32, device=self.device)
215
+ dones = T.as_tensor(self.memory.dones, dtype=T.float32, device=self.device)
216
+ old_logp = T.as_tensor(self.memory.log_probs, dtype=T.float32, device=self.device)
217
+ values = T.as_tensor(self.memory.values, dtype=T.float32, device=self.device)
218
+
219
+ with T.no_grad():
220
+ # Compute next values (bootstrap for final step)
221
+ next_values = T.cat([values[1:], values[-1:].clone()])
222
+ deltas = rewards + self.gamma * next_values * (1 - dones) - values
223
+
224
+ # --- GAE-Lambda ---
225
+ adv = T.zeros_like(rewards)
226
+ gae = 0.0
227
+ for t in reversed(range(len(rewards))):
228
+ gae = deltas[t] + self.gamma * self.lam * (1 - dones[t]) * gae
229
+ adv[t] = gae
230
+
231
+ returns = adv + values
232
+
233
+ # --- Return-based normalization (RBS) ---
234
+ sigma_t = returns.std(unbiased=False) + 1e-8
235
+ returns = returns / sigma_t
236
+ self.sigma_history.append(sigma_t.item())
237
+ adv = adv / sigma_t
238
+ # Advantage normalization
239
+ adv = (adv - adv.mean()) / (adv.std(unbiased=False) + 1e-8)
240
+
241
+ # --- PPO Multiple Epochs + Minibatch ---
242
+ total_loss_epoch = 0.0
243
+ num_samples = len(states)
244
+ batch_size = min(64, num_samples)
245
+ ppo_epochs = 4
246
+
247
+ for _ in range(ppo_epochs):
248
+ # Shuffle indices
249
+ idxs = T.randperm(num_samples)
250
+ for start in range(0, num_samples, batch_size):
251
+ batch_idx = idxs[start:start + batch_size]
252
+
253
+ b_states = states[batch_idx]
254
+ b_actions = actions[batch_idx]
255
+ b_old_logp = old_logp[batch_idx]
256
+ b_returns = returns[batch_idx]
257
+ b_adv = adv[batch_idx]
258
+
259
+ dist = self.policy.next_action(b_states)
260
+ new_logp = dist.log_prob(b_actions)
261
+ entropy = dist.entropy().mean()
262
+ ratio = (new_logp - b_old_logp).exp()
263
+
264
+ # --- Clipped surrogate objective ---
265
+ surr1 = ratio * b_adv
266
+ surr2 = T.clamp(ratio, 1 - self.clip, 1 + self.clip) * b_adv
267
+ policy_loss = -T.min(surr1, surr2).mean()
268
+
269
+ # --- Critic loss ---
270
+ value_pred = self.critic.evaluated_state(b_states)
271
+ value_loss = 0.5 * (b_returns - value_pred).pow(2).mean()
272
+
273
+ # --- Total loss ---
274
+ total_loss = (
275
+ policy_loss +
276
+ self.value_coef * value_loss -
277
+ self.entropy_coef * entropy
278
+ )
279
+
280
+ # Debug: track individual loss components
281
+ self.policy_loss_history.append(policy_loss.item())
282
+ self.value_loss_history.append(value_loss.item())
283
+
284
+ self.opt.zero_grad(set_to_none=True)
285
+ total_loss.backward()
286
+ self.opt.step()
287
+ total_loss_epoch += total_loss.item()
288
+
289
+ # Clear memory after full PPO update
290
+ self.memory.clear()
291
+
292
+ return total_loss_epoch / (ppo_epochs * (num_samples / batch_size))
293
+
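update_rbs differs from vanilla_ppo_update only in the return-based scaling step: with \sigma = \mathrm{std}(R) + 10^{-8} computed over the batch, it rescales R_t \leftarrow R_t / \sigma and \hat{A}_t \leftarrow \hat{A}_t / \sigma before the usual advantage z-scoring, and logs \sigma to sigma_history for the (currently commented-out) plot in the environment script.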
294
+
295
+
296
+
297
+
298
+
299
+ def update_observation_norm(self):
300
+ if len(self.memory.states) == 0:
301
+ return 0.0
302
+
303
+ # Convert memory to tensors
304
+ states = T.as_tensor(np.array(self.memory.states), dtype=T.float32, device=self.device)
305
+ actions = T.as_tensor(self.memory.actions, dtype=T.long, device=self.device)
306
+ rewards = T.as_tensor(self.memory.rewards, dtype=T.float32, device=self.device)
307
+ dones = T.as_tensor(self.memory.dones, dtype=T.float32, device=self.device)
308
+ old_logp = T.as_tensor(self.memory.log_probs, dtype=T.float32, device=self.device)
309
+ values = T.as_tensor(self.memory.values, dtype=T.float32, device=self.device)
310
+
311
+ with T.no_grad():
312
+ # Compute next values (bootstrap for final step)
313
+ next_values = T.cat([values[1:], values[-1:].clone()])
314
+ deltas = rewards + self.gamma * next_values * (1 - dones) - values
315
+
316
+ # --- GAE-Lambda ---
317
+ adv = T.zeros_like(rewards)
318
+ gae = 0.0
319
+ for t in reversed(range(len(rewards))):
320
+ gae = deltas[t] + self.gamma * self.lam * (1 - dones[t]) * gae
321
+ adv[t] = gae
322
+
323
+ returns = adv + values
324
+
325
+ # --- observation normalization ---
326
+ self.observeNorm.update(states)
327
+ states = self.observeNorm.normalize(states)
328
+ # Advantage normalization
329
+ #adv = (adv - adv.mean()) / (adv.std(unbiased=False) + 1e-8)
330
+
331
+ # --- PPO Multiple Epochs + Minibatch ---
332
+ total_loss_epoch = 0.0
333
+ num_samples = len(states)
334
+ batch_size = min(64, num_samples)
335
+ ppo_epochs = 4
336
+
337
+ for _ in range(ppo_epochs):
338
+ # Shuffle indices
339
+ idxs = T.randperm(num_samples)
340
+ for start in range(0, num_samples, batch_size):
341
+ batch_idx = idxs[start:start + batch_size]
342
+
343
+ b_states = states[batch_idx]
344
+ b_actions = actions[batch_idx]
345
+ b_old_logp = old_logp[batch_idx]
346
+ b_returns = returns[batch_idx]
347
+ b_adv = adv[batch_idx]
348
+
349
+ dist = self.policy.next_action(b_states)
350
+ new_logp = dist.log_prob(b_actions)
351
+ entropy = dist.entropy().mean()
352
+ ratio = (new_logp - b_old_logp).exp()
353
+
354
+ # --- Clipped surrogate objective ---
355
+ surr1 = ratio * b_adv
356
+ surr2 = T.clamp(ratio, 1 - self.clip, 1 + self.clip) * b_adv
357
+ policy_loss = -T.min(surr1, surr2).mean()
358
+
359
+ # --- Critic loss ---
360
+ value_pred = self.critic.evaluated_state(b_states)
361
+ value_loss = 0.5 * (b_returns - value_pred).pow(2).mean()
362
+
363
+ # --- Total loss ---
364
+ total_loss = (
365
+ policy_loss +
366
+ self.value_coef * value_loss -
367
+ self.entropy_coef * entropy
368
+ )
369
+
370
+ # Debug: track individual loss components
371
+ self.policy_loss_history.append(policy_loss.item())
372
+ self.value_loss_history.append(value_loss.item())
373
+
374
+ self.opt.zero_grad(set_to_none=True)
375
+ total_loss.backward()
376
+ self.opt.step()
377
+ total_loss_epoch += total_loss.item()
378
+
379
+ # Clear memory after full PPO update
380
+ self.memory.clear()
381
+
382
+ return total_loss_epoch / (ppo_epochs * (num_samples / batch_size))
383
+
384
+
385
+
386
+
387
+ def update_advantage_norm(self):
388
+ if len(self.memory.states) == 0:
389
+ return 0.0
390
+
391
+ # Convert memory to tensors
392
+ states = T.as_tensor(np.array(self.memory.states), dtype=T.float32, device=self.device)
393
+ actions = T.as_tensor(self.memory.actions, dtype=T.long, device=self.device)
394
+ rewards = T.as_tensor(self.memory.rewards, dtype=T.float32, device=self.device)
395
+ dones = T.as_tensor(self.memory.dones, dtype=T.float32, device=self.device)
396
+ old_logp = T.as_tensor(self.memory.log_probs, dtype=T.float32, device=self.device)
397
+ values = T.as_tensor(self.memory.values, dtype=T.float32, device=self.device)
398
+
399
+ with T.no_grad():
400
+ # Compute next values (bootstrap for final step)
401
+ next_values = T.cat([values[1:], values[-1:].clone()])
402
+ deltas = rewards + self.gamma * next_values * (1 - dones) - values
403
+
404
+ # --- GAE-Lambda ---
405
+ adv = T.zeros_like(rewards)
406
+ gae = 0.0
407
+ for t in reversed(range(len(rewards))):
408
+ gae = deltas[t] + self.gamma * self.lam * (1 - dones[t]) * gae
409
+ adv[t] = gae
410
+
411
+
412
+
413
+ returns = adv + values
414
+
415
+ # --- Advantage normalization ---
416
+ self.advantageNorm.update(adv)
417
+ adv = self.advantageNorm.normalize(adv)
418
+
419
+
420
+
421
+ # --- PPO Multiple Epochs + Minibatch ---
422
+ total_loss_epoch = 0.0
423
+ num_samples = len(states)
424
+ batch_size = min(64, num_samples)
425
+ ppo_epochs = 4
426
+
427
+ for _ in range(ppo_epochs):
428
+ # Shuffle indices
429
+ idxs = T.randperm(num_samples)
430
+ for start in range(0, num_samples, batch_size):
431
+ batch_idx = idxs[start:start + batch_size]
432
+
433
+ b_states = states[batch_idx]
434
+ b_actions = actions[batch_idx]
435
+ b_old_logp = old_logp[batch_idx]
436
+ b_returns = returns[batch_idx]
437
+ b_adv = adv[batch_idx]
438
+
439
+ dist = self.policy.next_action(b_states)
440
+ new_logp = dist.log_prob(b_actions)
441
+ entropy = dist.entropy().mean()
442
+ ratio = (new_logp - b_old_logp).exp()
443
+
444
+ # --- Clipped surrogate objective ---
445
+ surr1 = ratio * b_adv
446
+ surr2 = T.clamp(ratio, 1 - self.clip, 1 + self.clip) * b_adv
447
+ policy_loss = -T.min(surr1, surr2).mean()
448
+
449
+ # --- Critic loss ---
450
+ value_pred = self.critic.evaluated_state(b_states)
451
+ value_loss = 0.5 * (b_returns - value_pred).pow(2).mean()
452
+
453
+ # --- Total loss ---
454
+ total_loss = (
455
+ policy_loss +
456
+ self.value_coef * value_loss -
457
+ self.entropy_coef * entropy
458
+ )
459
+
460
+ # Debug: track individual loss components
461
+ self.policy_loss_history.append(policy_loss.item())
462
+ self.value_loss_history.append(value_loss.item())
463
+
464
+ self.opt.zero_grad(set_to_none=True)
465
+ total_loss.backward()
466
+ self.opt.step()
467
+ total_loss_epoch += total_loss.item()
468
+
469
+ # Clear memory after full PPO update
470
+ self.memory.clear()
471
+
472
+ return total_loss_epoch / (ppo_epochs * (num_samples / batch_size))
473
+
474
+ def update_return_norm(self):
475
+ if len(self.memory.states) == 0:
476
+ return 0.0
477
+
478
+ # Convert memory to tensors
479
+ states = T.as_tensor(np.array(self.memory.states), dtype=T.float32, device=self.device)
480
+ actions = T.as_tensor(self.memory.actions, dtype=T.long, device=self.device)
481
+ rewards = T.as_tensor(self.memory.rewards, dtype=T.float32, device=self.device)
482
+ dones = T.as_tensor(self.memory.dones, dtype=T.float32, device=self.device)
483
+ old_logp = T.as_tensor(self.memory.log_probs, dtype=T.float32, device=self.device)
484
+ values = T.as_tensor(self.memory.values, dtype=T.float32, device=self.device)
485
+
486
+ with T.no_grad():
487
+ # Compute next values (bootstrap for final step)
488
+ next_values = T.cat([values[1:], values[-1:].clone()])
489
+ deltas = rewards + self.gamma * next_values * (1 - dones) - values
490
+
491
+ # --- GAE-Lambda ---
492
+ adv = T.zeros_like(rewards)
493
+ gae = 0.0
494
+ for t in reversed(range(len(rewards))):
495
+ gae = deltas[t] + self.gamma * self.lam * (1 - dones[t]) * gae
496
+ adv[t] = gae
497
+
498
+
499
+
500
+ returns = adv + values
501
+
502
+ # --- returns normalization ---
503
+ self.returnNorm.update(returns)
504
+ returns = self.returnNorm.normalize(returns)
505
+
506
+
507
+ # Advantage normalization
508
+ #adv = (adv - adv.mean()) / (adv.std(unbiased=False) + 1e-8)
509
+
510
+ # --- PPO Multiple Epochs + Minibatch ---
511
+ total_loss_epoch = 0.0
512
+ num_samples = len(states)
513
+ batch_size = min(64, num_samples)
514
+ ppo_epochs = 4
515
+
516
+ for _ in range(ppo_epochs):
517
+ # Shuffle indices
518
+ idxs = T.randperm(num_samples)
519
+ for start in range(0, num_samples, batch_size):
520
+ batch_idx = idxs[start:start + batch_size]
521
+
522
+ b_states = states[batch_idx]
523
+ b_actions = actions[batch_idx]
524
+ b_old_logp = old_logp[batch_idx]
525
+ b_returns = returns[batch_idx]
526
+ b_adv = adv[batch_idx]
527
+
528
+ dist = self.policy.next_action(b_states)
529
+ new_logp = dist.log_prob(b_actions)
530
+ entropy = dist.entropy().mean()
531
+ ratio = (new_logp - b_old_logp).exp()
532
+
533
+ # --- Clipped surrogate objective ---
534
+ surr1 = ratio * b_adv
535
+ surr2 = T.clamp(ratio, 1 - self.clip, 1 + self.clip) * b_adv
536
+ policy_loss = -T.min(surr1, surr2).mean()
537
+
538
+ # --- Critic loss ---
539
+ value_pred = self.critic.evaluated_state(b_states)
540
+ value_loss = 0.5 * (b_returns - value_pred).pow(2).mean()
541
+
542
+ # --- Total loss ---
543
+ total_loss = (
544
+ policy_loss +
545
+ self.value_coef * value_loss -
546
+ self.entropy_coef * entropy
547
+ )
548
+
549
+ # Debug: track individual loss components
550
+ self.policy_loss_history.append(policy_loss.item())
551
+ self.value_loss_history.append(value_loss.item())
552
+
553
+ self.opt.zero_grad(set_to_none=True)
554
+ total_loss.backward()
555
+ self.opt.step()
556
+ total_loss_epoch += total_loss.item()
557
+
558
+ # Clear memory after full PPO update
559
+ self.memory.clear()
560
+
561
+ return total_loss_epoch / (ppo_epochs * (num_samples / batch_size))
562
+
563
+ def update_reward_gradient_clipping(self):
564
+ if len(self.memory.states) == 0:
565
+ return 0.0
566
+
567
+ # Convert memory to tensors
568
+ states = T.as_tensor(np.array(self.memory.states), dtype=T.float32, device=self.device)
569
+ actions = T.as_tensor(self.memory.actions, dtype=T.long, device=self.device)
570
+ rewards = T.as_tensor(self.memory.rewards, dtype=T.float32, device=self.device)
571
+ dones = T.as_tensor(self.memory.dones, dtype=T.float32, device=self.device)
572
+ old_logp = T.as_tensor(self.memory.log_probs, dtype=T.float32, device=self.device)
573
+ values = T.as_tensor(self.memory.values, dtype=T.float32, device=self.device)
574
+
575
+ # Reward clipping
576
+ rewards = T.clamp(rewards, -1, 1)
577
+
578
+ with T.no_grad():
579
+ # Compute next values (bootstrap for final step)
580
+ next_values = T.cat([values[1:], values[-1:].clone()])
581
+ deltas = rewards + self.gamma * next_values * (1 - dones) - values
582
+
583
+ # --- GAE-Lambda ---
584
+ adv = T.zeros_like(rewards)
585
+ gae = 0.0
586
+ for t in reversed(range(len(rewards))):
587
+ gae = deltas[t] + self.gamma * self.lam * (1 - dones[t]) * gae
588
+ adv[t] = gae
589
+
590
+ returns = adv + values
591
+ # Advantage normalization
592
+ #adv = (adv - adv.mean()) / (adv.std(unbiased=False) + 1e-8)
593
+
594
+ # --- PPO Multiple Epochs + Minibatch ---
595
+ total_loss_epoch = 0.0
596
+ num_samples = len(states)
597
+ batch_size = min(64, num_samples)
598
+ ppo_epochs = 4
599
+
600
+ for _ in range(ppo_epochs):
601
+ # Shuffle indices
602
+ idxs = T.randperm(num_samples)
603
+ for start in range(0, num_samples, batch_size):
604
+ batch_idx = idxs[start:start + batch_size]
605
+
606
+ b_states = states[batch_idx]
607
+ b_actions = actions[batch_idx]
608
+ b_old_logp = old_logp[batch_idx]
609
+ b_returns = returns[batch_idx]
610
+ b_adv = adv[batch_idx]
611
+
612
+ dist = self.policy.next_action(b_states)
613
+ new_logp = dist.log_prob(b_actions)
614
+ entropy = dist.entropy().mean()
615
+ ratio = (new_logp - b_old_logp).exp()
616
+
617
+ # --- Clipped surrogate objective ---
618
+ surr1 = ratio * b_adv
619
+ surr2 = T.clamp(ratio, 1 - self.clip, 1 + self.clip) * b_adv
620
+ policy_loss = -T.min(surr1, surr2).mean()
621
+
622
+ # --- Critic loss ---
623
+ value_pred = self.critic.evaluated_state(b_states)
624
+ value_loss = 0.5 * (b_returns - value_pred).pow(2).mean()
625
+
626
+ # --- Total loss ---
627
+ total_loss = (
628
+ policy_loss +
629
+ self.value_coef * value_loss -
630
+ self.entropy_coef * entropy
631
+ )
632
+
633
+ # Debug: track individual loss components
634
+ self.policy_loss_history.append(policy_loss.item())
635
+ self.value_loss_history.append(value_loss.item())
636
+
637
+ self.opt.zero_grad(set_to_none=True)
638
+ total_loss.backward()
639
+ T.nn.utils.clip_grad_norm_(list(self.policy.parameters()) + list(self.critic.parameters()), 0.5)
640
+ self.opt.step()
641
+
642
+ total_loss_epoch += total_loss.item()
643
+
644
+ # Clear memory after full PPO update
645
+ self.memory.clear()
646
+
647
+ return total_loss_epoch / (ppo_epochs * (num_samples / batch_size))
648
+
649
+ """
650
+ # Policy network (simple MLP, flattened observations)
651
+ class Policy(nn.Module):
652
+ def __init__(self, obs_dim: int, action_dim: int, hidden: int):
653
+ super().__init__()
654
+ self.net = nn.Sequential(
655
+ nn.Linear(obs_dim, hidden),
656
+ nn.ReLU(),
657
+ nn.Linear(hidden, hidden),
658
+ nn.ReLU(),
659
+ nn.Linear(hidden, action_dim)
660
+ )
661
+
662
+ def next_action(self, state: T.Tensor) -> Categorical:
663
+ # Returns the probability distribution over actions
664
+ if state.dim() == 1:
665
+ state = state.unsqueeze(0)
666
+ state = state.view(state.size(0), -1)
667
+ return Categorical(logits=self.net(state))
668
+ """
669
+
670
+ # Policy network (CNN)
671
+ class Policy(nn.Module):
672
+ def __init__(self, obs_shape: tuple, action_dim: int, hidden: int):
673
+ super().__init__()
674
+ c, h, w = obs_shape
675
+ # Suggested architecture for Atari: https://arxiv.org/pdf/1312.5602
676
+ self.cnn = nn.Sequential(
677
+ nn.Conv2d(c, 16, kernel_size=8, stride=4),
678
+ nn.ReLU(),
679
+ nn.Conv2d(16, 32, kernel_size=4, stride=2),
680
+ nn.ReLU(),
681
+ nn.Flatten()
682
+ )
683
+
684
+ with T.no_grad():
685
+ cnn_output_dim = self.cnn(T.zeros(1, c, h, w)).shape[1]
686
+
687
+ self.net = nn.Sequential(
688
+ nn.Linear(cnn_output_dim, hidden),
689
+ nn.ReLU(),
690
+ nn.Linear(hidden, action_dim)
691
+ )
692
+
693
+ def next_action(self, state: T.Tensor) -> Categorical:
694
+ # Returns the probability distribution over actions
695
+ if state.dim() == 3:
696
+ state = state.unsqueeze(0)
697
+ cnn_out = self.cnn(state)
698
+ return Categorical(logits=self.net(cnn_out))
699
+
700
+ """
701
+ # Critic network (simple MLP, flattened observations)
702
+ class Critic(nn.Module):
703
+ def __init__(self, obs_dim: int, hidden: int):
704
+ super().__init__()
705
+ self.net = nn.Sequential(
706
+ nn.Linear(obs_dim, hidden),
707
+ nn.ReLU(),
708
+ nn.Linear(hidden, hidden),
709
+ nn.ReLU(),
710
+ nn.Linear(hidden, 1)
711
+ )
712
+
713
+ def evaluated_state(self, x: T.Tensor) -> T.Tensor:
714
+ if x.dim() == 1:
715
+ x = x.unsqueeze(0)
716
+ x = x.view(x.size(0), -1)
717
+ return self.net(x).squeeze(-1)
718
+ """
719
+
720
+ # Critic network (CNN)
721
+ class Critic(nn.Module):
722
+ def __init__(self, obs_shape: tuple, hidden: int):
723
+ super().__init__()
724
+ c, h, w = obs_shape
725
+ # Suggested architecture for Atari: https://arxiv.org/pdf/1312.5602
726
+ self.cnn = nn.Sequential(
727
+ nn.Conv2d(c, 16, kernel_size=8, stride=4),
728
+ nn.ReLU(),
729
+ nn.Conv2d(16, 32, kernel_size=4, stride=2),
730
+ nn.ReLU(),
731
+ nn.Flatten()
732
+ )
733
+
734
+ with T.no_grad():
735
+ cnn_output_dim = self.cnn(T.zeros(1, c, h, w)).shape[1]
736
+
737
+ self.net = nn.Sequential(
738
+ nn.Linear(cnn_output_dim, hidden),
739
+ nn.ReLU(),
740
+ nn.Linear(hidden, 1)
741
+ )
742
+
743
+ def evaluated_state(self, x: T.Tensor) -> T.Tensor:
744
+ if x.dim() == 3:
745
+ x = x.unsqueeze(0)
746
+ cnn_out = self.cnn(x)
747
+ return self.net(cnn_out).squeeze(-1)
748
+
749
+ class Memory():
750
+ def __init__(self):
751
+ self.states = []
752
+ self.actions = []
753
+ self.rewards = []
754
+ self.dones = []
755
+ self.log_probs = []
756
+ self.values = []
757
+ self.next_values = []
758
+
759
+ def store(self, state, action, reward, done, log_prob, value, next_value):
760
+ self.states.append(np.asarray(state, dtype=np.float32))
761
+ self.actions.append(int(action))
762
+ self.rewards.append(float(reward))
763
+ self.dones.append(float(done))
764
+ self.log_probs.append(float(log_prob))
765
+ self.values.append(float(value))
766
+ self.next_values.append(float(next_value))
767
+
768
+ """
769
+ # For mini-batch updates? To be implemented
770
+ def start_batch(self, batch_size: int):
771
+ n_states = len(self.states)
772
+ starts = np.arange(0, n_states, batch_size)
773
+ index = np.arange(n_states, dtype=np.int64)
774
+ np.random.shuffle(index)
775
+ return [index[s:s + batch_size] for s in starts]
776
+ """
777
+
778
+ def clear(self):
779
+ self.states = []
780
+ self.actions = []
781
+ self.rewards = []
782
+ self.dones = []
783
+ self.log_probs = []
784
+ self.values = []
785
+ self.next_values = []
786
+
787
+
788
+
789
+ class ObservationNorm:
790
+ def __init__(self):
791
+ self.main_mean = 0
792
+ self.main_var = 0
793
+ self.count = 1e-4
794
+
795
+ def update(self, x: T.Tensor):
796
+ batch_mean = T.mean(x, dim=0)
797
+ batch_var = T.var(x, dim=0)
798
+ batch_count = x.shape[0]
799
+ self._update_from_moments(batch_mean, batch_var, batch_count)
800
+
801
+ def _update_from_moments(self, batch_mean, batch_var, batch_count):
802
+ delta = batch_mean - self.main_mean
803
+ tot_count = self.count + batch_count
804
+ new_mean = self.main_mean + delta * batch_count / tot_count #Update the running mean
805
+ m_a = self.main_var * self.count
806
+ m_b = batch_var * batch_count
807
+ M2 = m_a + m_b + delta ** 2 * self.count * batch_count / tot_count  # parallel (Chan) variance combination; delta ** 2 avoids np.square, which fails on CUDA tensors
808
+ new_var = M2 / tot_count # update the running variance
809
+
810
+ self.main_mean, self.main_var, self.count = new_mean, new_var, tot_count
811
+
812
+ def normalize(self, x):
813
+
814
+ return (x - self.main_mean) / (self.main_var ** 0.5 + 1e-8)  # Epsilon guards against division by zero; ** 0.5 stays on-device (np.sqrt fails on CUDA tensors).
815
+
816
+
817
+
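_update_from_moments is the standard parallel ("Chan") combination of a running mean/variance with a batch's moments. A small consistency check against the moments of the concatenated data, assuming ObservationNorm is importable from this module (sketch only):

import torch as T

rms = ObservationNorm()
chunks = [T.randn(50, 4), T.randn(30, 4) + 2.0]
for c in chunks:
    rms.update(c)

full = T.cat(chunks, dim=0)
# Running estimates should land close to the full-data moments; the small gap comes from
# T.var's unbiased default inside update() and the 1e-4 initial count.
print(rms.main_mean, full.mean(dim=0))
print(rms.main_var, full.var(dim=0, unbiased=False))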
818
+
819
+
820
+
821
+ class AdvantageNorm:
822
+ '''
823
+ This class implements advantage normalization. In this file's variant, a running mean and
824
+ variance are maintained across batches and used to normalize each new batch.
825
+
826
+ '''
827
+ def __init__(self):
828
+ self.main_mean = 0
829
+ self.main_var = 0
830
+ self.count = 1e-4
831
+
832
+ def update(self, x: T.Tensor):
833
+ batch_mean = T.mean(x, dim=0)
834
+ batch_var = T.var(x, dim=0)
835
+ batch_count = x.shape[0]
836
+ self._update_from_moments(batch_mean, batch_var, batch_count)
837
+
838
+ def _update_from_moments(self, batch_mean, batch_var, batch_count):
839
+ delta = batch_mean - self.main_mean
840
+ tot_count = self.count + batch_count
841
+ new_mean = self.main_mean + delta * batch_count / tot_count #Update the running mean
842
+ m_a = self.main_var * self.count
843
+ m_b = batch_var * batch_count
844
+ M2 = m_a + m_b + delta ** 2 * self.count * batch_count / tot_count  # parallel (Chan) variance combination; delta ** 2 avoids np.square, which fails on CUDA tensors
845
+ new_var = M2 / tot_count # update the running variance
846
+
847
+ self.main_mean, self.main_var, self.count = new_mean, new_var, tot_count
848
+
849
+ def normalize(self, x):
850
+
851
+ return (x - self.main_mean) / (self.main_var ** 0.5 + 1e-8)  # Epsilon guards against division by zero; ** 0.5 stays on-device (np.sqrt fails on CUDA tensors).
852
+
853
+
854
+
855
+
856
+
857
+ class ReturnNorm:
858
+ '''
859
+ This class implements return normalization. In this file's variant, a running mean and
860
+ variance are maintained across batches and used to normalize each new batch.
861
+
862
+ '''
863
+ def __init__(self):
864
+ self.main_mean = 0
865
+ self.main_var = 0
866
+ self.count = 1e-4
867
+
868
+ def update(self, x: T.Tensor):
869
+ batch_mean = T.mean(x, dim=0)
870
+ batch_var = T.var(x, dim=0)
871
+ batch_count = x.shape[0]
872
+ self._update_from_moments(batch_mean, batch_var, batch_count)
873
+
874
+ def _update_from_moments(self, batch_mean, batch_var, batch_count):
875
+ delta = batch_mean - self.main_mean
876
+ tot_count = self.count + batch_count
877
+ new_mean = self.main_mean + delta * batch_count / tot_count #Update the running mean
878
+ m_a = self.main_var * self.count
879
+ m_b = batch_var * batch_count
880
+ M2 = m_a + m_b + delta ** 2 * self.count * batch_count / tot_count  # parallel (Chan) variance combination; delta ** 2 avoids np.square, which fails on CUDA tensors
881
+ new_var = M2 / tot_count # update the running variance
882
+
883
+ self.main_mean, self.main_var, self.count = new_mean, new_var, tot_count
884
+
885
+ def normalize(self, x):
886
+
887
+ return (x - self.main_mean) / (self.main_var ** 0.5 + 1e-8)  # Epsilon guards against division by zero; ** 0.5 stays on-device (np.sqrt fails on CUDA tensors).
887
+
889
+
890
+
891
+
892
+
893
+
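ObservationNorm, AdvantageNorm and ReturnNorm above share identical running-statistics code; one possible later refactor (a sketch only, not part of this commit) is a single RunningMeanStd helper that all three reuse:

import torch as T

class RunningMeanStd:
    # Running mean/variance via the same parallel-moments update used by the three classes above.
    def __init__(self, eps: float = 1e-4):
        self.mean, self.var, self.count = 0.0, 0.0, eps

    def update(self, x: T.Tensor):
        b_mean = x.mean(dim=0)
        b_var = x.var(dim=0, unbiased=False)
        b_n = x.shape[0]
        delta = b_mean - self.mean
        tot = self.count + b_n
        self.mean = self.mean + delta * b_n / tot
        m2 = self.var * self.count + b_var * b_n + delta ** 2 * self.count * b_n / tot
        self.var, self.count = m2 / tot, tot

    def normalize(self, x: T.Tensor) -> T.Tensor:
        return (x - self.mean) / (self.var ** 0.5 + 1e-8)

# ObservationNorm / AdvantageNorm / ReturnNorm could then be thin aliases or subclasses of this helper.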
Observation_Advantage_Norm_running_averages/ppo_rew_norm_obs_env_running_average.py ADDED
@@ -0,0 +1,163 @@
1
+
2
+ import gymnasium as gym
3
+ import sys
4
+ import matplotlib.pyplot as plt
5
+ import ale_py
6
+ from ppo__rew_norm_obs_running_average import *
7
+ from gymnasium.spaces import Box
8
+ import cv2
9
+
10
+ def preprocess(obs):
11
+ # Convert to grayscale
12
+ obs = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY)
13
+ # Resize
14
+ obs = cv2.resize(obs, (84, 84), interpolation=cv2.INTER_AREA)
15
+ # Add channel dimension and normalize
16
+ return np.expand_dims(obs, axis=0).astype(np.float32) / 255.0
17
+
18
+
19
+ def rl_model(type):
20
+ # env = gym.make("ALE/SpaceInvaders-v5", render_mode='human')
21
+ # env = gym.make("ALE/Pacman-v5", render_mode="human")
22
+ env = gym.make("ALE/Pacman-v5")
23
+
24
+ episode = 0
25
+ total_return = 0
26
+ ep_return = 0
27
+ steps = 1000
28
+ batches = 100
29
+
30
+ print("Observation space:", env.observation_space)
31
+ print("Action space:", env.action_space)
32
+ """
33
+ agent = Agent(obs_space=env.observation_space, action_space=env.action_space,
34
+ hidden=64, lr=3e-4, gamma=0.99, clip_coef=0.2,
35
+ entropy_coef=0.01, value_coef=0.5, seed=70,
36
+ batch_size = 64, ppo_epochs = 4, lam = 0.95)
37
+
38
+ """
39
+ # Initialize CNN with a dummy observation (to get correct input shape)
40
+ obs, _ = env.reset()
41
+ dummy_obs_space = Box(low=0.0, high=1.0, shape=preprocess(obs).shape)
42
+ update_type = type
43
+ agent = Agent(obs_space=dummy_obs_space, action_space=env.action_space,
44
+ hidden=64, lr=3e-4, gamma=0.99, clip_coef=0.2,
45
+ entropy_coef=0.01, value_coef=0.5, seed=70,
46
+ batch_size=64, ppo_epochs=4, lam=0.95, update_type=update_type)
47
+ """
48
+ # Stats for Return-Based Scaling only
49
+ # === Return-Based Scaling stats ===
50
+ r_mean, r_var = 0.0, 1e-8
51
+ g2_mean = 1.0
52
+
53
+ agent.r_var = r_var
54
+ agent.g2_mean = g2_mean
55
+ """
56
+
57
+ try:
58
+ obs, info = env.reset(seed=42)
59
+ state = preprocess(obs)
60
+
61
+ loss_history = []
62
+ reward_history = []
63
+
64
+ for update in range(1, batches + 1):
65
+ for t in range(steps):
66
+ action, logp, value = agent.choose_action(state)
67
+ next_obs, reward, terminated, truncated, info = env.step(action)
68
+ done = terminated or truncated
69
+ next_state = preprocess(next_obs)
70
+
71
+ agent.remember(state, action, reward, done, logp, value, next_state)
72
+
73
+ ep_return += reward
74
+ state = next_state
75
+
76
+ if done:
77
+ episode += 1
78
+ total_return += ep_return
79
+ print(f"Episode {episode} return: {ep_return:.2f}")
80
+ ep_return = 0
81
+ obs, info = env.reset()
82
+ state = preprocess(obs)
83
+
84
+ # Run the PPO update variant selected by update_type
85
+ avg_loss = agent._update()
86
+
87
+ # Vanilla PPO (no normalization)
88
+ # avg_loss = agent.vanilla_ppo_update()
89
+ loss_history.append(avg_loss)
90
+
91
+ avg_ret = (total_return / episode) if episode else 0
92
+ reward_history.append(avg_ret)
93
+ print(f"Update {update}: episodes={episode}, avg_return={avg_ret:.2f}, avg_loss={avg_loss:.4f}")
94
+
95
+ fig = plt.figure(figsize=(12, 8))
96
+
97
+ """
98
+ # Plot for Return-Based Scaling only
99
+ ax1 = plt.subplot(220)
100
+ ax1.plot(agent.sigma_history, label="Return σ")
101
+ ax1.set_xlabel("PPO Update")
102
+ ax1.set_ylabel("σ (Return Std)")
103
+ """
104
+
105
+ ax2 = plt.subplot(221)
106
+ ax2.plot(loss_history, label="Avg Loss")
107
+ ax2.set_ylabel("Average PPO Loss")
108
+ ax2.set_xlabel("PPO Update")
109
+
110
+ ax3 = plt.subplot(222)
111
+ ax3.plot(reward_history, label="Reward")
112
+ ax3.set_ylabel("Reward")
113
+ ax3.set_xlabel("PPO Update")
114
+
115
+ # Details about value loss and policy loss
116
+ ax4 = plt.subplot(223)
117
+ ax4.plot(agent.policy_loss_history, label="Policy Loss", alpha=0.7)
118
+ ax4.set_ylabel("Policy Loss")
119
+ ax4.set_xlabel("Training Step")
120
+ ax4.legend()
121
+
122
+ ax5 = plt.subplot(224)
123
+ ax5.plot(agent.value_loss_history, label="Value Loss", alpha=0.7)
124
+ ax5.set_ylabel("Value Loss")
125
+ ax5.set_xlabel("Training Step")
126
+ ax5.legend()
127
+
128
+ fig.suptitle("PPO Training Stability of type " + update_type +
129
+ "-running_average")
130
+ fig.tight_layout()
131
+ plt.savefig(type +"_running_average_.png")
132
+
133
+
134
+
135
+
136
+ except Exception as e:
137
+ print(f"Error: {e}", file=sys.stderr)
138
+ return 1
139
+ finally:
140
+ avg = total_return / episode if episode else 0
141
+ print(f"\nEpisodes: {episode}, Avg return: {avg:.3f}")
142
+ env.close()
143
+
144
+ return 0
145
+
146
+
147
+
148
+
149
+
150
+
151
+
152
+
153
+ def main() -> int:
154
+ type_list = ["update_observation_norm", "update_advantage_norm", "update_return_norm", "vanilla_ppo_update"]
155
+
156
+ for type in type_list:
157
+ rl_model(type)
158
+
159
+ return 0
160
+
161
+
162
+ if __name__ == "__main__":
163
+ raise SystemExit(main())
Observation_Advantage_Norm_running_averages/update_advantage_norm_running_average_.png ADDED
Observation_Advantage_Norm_running_averages/update_observation_norm_running_average_.png ADDED
Observation_Advantage_Norm_running_averages/update_return_norm_running_average_.png ADDED
Observation_Advantage_Norm_running_averages/vanilla_ppo_update_running_average_.png ADDED