privateboss committed · verified
Commit 688b303 · 1 Parent(s): 69ec84f

Upload 6 files

Files changed (6)
  1. agent.py +278 -0
  2. config.py +42 -0
  3. main.py +246 -0
  4. reward_shaping.py +113 -0
  5. trained_agent.py +120 -0
  6. utils.py +102 -0
agent.py ADDED
@@ -0,0 +1,278 @@
+ import tensorflow as tf
+ from keras.layers import Dense, Normalization, Input
+ from keras.models import Model
+ import tensorflow_probability as tfp
+ import numpy as np
+ import os
+ from config import *
+
+ # Helper to normalize observations with a running mean/variance estimate
+ class RunningMeanStd:
+     def __init__(self, shape):
+         self.mean = np.zeros(shape, dtype=np.float32)
+         self.var = np.ones(shape, dtype=np.float32)
+         self.count = 1e-4
+
+     def update(self, x):
+         batch_mean = np.mean(x, axis=0)
+         batch_var = np.var(x, axis=0)
+         batch_count = x.shape[0]
+
+         self.update_from_moments(batch_mean, batch_var, batch_count)
+
+     def update_from_moments(self, batch_mean, batch_var, batch_count):
+         # Parallel mean/variance merge of the running moments with the new batch
+         delta = batch_mean - self.mean
+         total_count = self.count + batch_count
+
+         new_mean = self.mean + delta * batch_count / total_count
+
+         m_a = self.var * self.count
+         m_b = batch_var * batch_count
+         m2 = m_a + m_b + np.square(delta) * self.count * batch_count / total_count
+
+         new_var = m2 / total_count
+
+         self.mean = new_mean
+         self.var = new_var
+         self.count = total_count
+
+ class PPOAgent:
+     def __init__(self, obs_shape, action_size, total_timesteps):
+         self.obs_shape = obs_shape
+         self.action_size = action_size
+
+         # Networks
+         self.policy = self._build_policy_model(obs_shape, action_size)
+         self.value = self._build_value_model(obs_shape)
+
+         # Learning Rate Schedule (crucial for PPO): decays linearly to zero over training
+         self.lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
+             initial_learning_rate=LEARNING_RATE,
+             decay_steps=total_timesteps * PPO_EPOCHS / (N_STEPS * NUM_ENVS),
+             end_learning_rate=0.0
+         )
+         self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.lr_schedule, epsilon=1e-5)
+
+         # Observation normalizer
+         self.obs_rms = RunningMeanStd(shape=obs_shape)
+
+         # --- CHECKPOINTING SETUP ---
+         # Wrap RMS parameters in tf.Variable so they can be tracked and saved
+         self.rms_mean_var = tf.Variable(self.obs_rms.mean, dtype=tf.float32, name="obs_rms_mean")
+         self.rms_var_var = tf.Variable(self.obs_rms.var, dtype=tf.float32, name="obs_rms_var")
+         self.rms_count_var = tf.Variable(self.obs_rms.count, dtype=tf.float32, name="obs_rms_count")
+
+         self.checkpoint = tf.train.Checkpoint(
+             policy=self.policy,
+             value=self.value,
+             optimizer=self.optimizer,
+             obs_rms_mean=self.rms_mean_var,
+             obs_rms_var=self.rms_var_var,
+             obs_rms_count=self.rms_count_var
+         )
+         self.checkpoint_manager = tf.train.CheckpointManager(
+             self.checkpoint, os.path.join(SAVE_PATH, 'tf_checkpoints'), max_to_keep=1000
+         )
+
+     def _build_policy_model(self, obs_shape, action_size):
+         inputs = tf.keras.Input(shape=obs_shape)
+         x = Dense(64, activation='relu')(inputs)
+         x = Dense(64, activation='relu')(x)
+         logits = Dense(action_size)(x)
+         return tf.keras.Model(inputs=inputs, outputs=logits, name="policy_model")
+
+     def _build_value_model(self, obs_shape):
+         inputs = tf.keras.Input(shape=obs_shape)
+         x = Dense(64, activation='relu')(inputs)
+         x = Dense(64, activation='relu')(x)
+         value = Dense(1)(x)
+         return tf.keras.Model(inputs=inputs, outputs=value, name="value_model")
+
+     def adapt_normalization(self, initial_observations):
+         """Updates the observation normalizer using initial data and saves state to checkpoint variables."""
+         self.obs_rms.update(initial_observations)
+         # Update checkpoint variables for persistence
+         self.rms_mean_var.assign(self.obs_rms.mean)
+         self.rms_var_var.assign(self.obs_rms.var)
+         self.rms_count_var.assign(self.obs_rms.count)
+
+     def normalize_obs(self, obs):
+         """
+         Applies normalization. Runs in Eager mode (NumPy input) or Graph mode (Tensor input).
+         The source of the RMS parameters is selected based on the input type.
+         """
+         is_tensor = tf.is_tensor(obs)
+
+         # Determine the RMS source:
+         # use the tf.Variables if the input is a Tensor (for learn_step),
+         # use the NumPy arrays if the input is a NumPy array (for select_action / rollout prep).
+         rms_mean = self.rms_mean_var if is_tensor else self.obs_rms.mean
+         rms_var = self.rms_var_var if is_tensor else self.obs_rms.var
+
+         # Convert the input to float32 if it is not already a tensor
+         if not is_tensor:
+             obs = obs.astype(np.float32)
+
+         # Normalization calculation (must use tf functions)
+         normalized_obs = (obs - rms_mean) / tf.sqrt(rms_var + 1e-8)
+
+         # Clipping (using tf.clip_by_value)
+         normalized_obs = tf.clip_by_value(normalized_obs, -10.0, 10.0)
+
+         # Return a NumPy array if the input was NumPy (used for rollout collection and saving)
+         if not is_tensor:
+             # Explicitly convert the result of the TF operations back to NumPy here
+             return normalized_obs.numpy()
+
+         return normalized_obs
+
+     # --- FIX: removed @tf.function here to allow .numpy() calls in eager mode ---
+     def select_action(self, obs):
+         """Selects an action and computes the value and log-prob for the given observation in eager mode."""
+
+         # 1. Convert the incoming NumPy array to a Tensor for the model forward pass
+         obs_tensor = tf.convert_to_tensor(obs, dtype=tf.float32)
+
+         # 2. Normalize the observation tensor
+         normalized_obs = self.normalize_obs(obs_tensor)
+
+         # Forward pass (runs efficiently even without @tf.function thanks to Keras's tracing)
+         logits = self.policy(normalized_obs, training=False)
+         values = self.value(normalized_obs, training=False)
+
+         # Create a categorical distribution and sample
+         distribution = tfp.distributions.Categorical(logits=logits)
+         actions = distribution.sample()
+
+         # Compute the log probability of the sampled action
+         log_probs = distribution.log_prob(actions)
+
+         # 3. Convert EagerTensors back to NumPy arrays (works because we are in eager mode)
+         actions_np = actions.numpy().astype(np.int64)
+         values_np = values.numpy().flatten()
+         log_probs_np = log_probs.numpy()
+
+         return actions_np, values_np, log_probs_np
+
+     # The learn_step remains decorated with @tf.function
+     @tf.function
+     def learn_step(self, obs, actions, old_log_probs, returns, advantages, old_values):
+         """Performs a single PPO optimization step."""
+         with tf.GradientTape() as tape:
+             # 1. Forward pass
+             logits = self.policy(obs, training=True)
+             # Squeeze (batch, 1) -> (batch,) so the value loss broadcasts correctly
+             # against the flattened returns/old_values
+             values = tf.squeeze(self.value(obs, training=True), axis=-1)
+
+             # 2. Compute the probability ratio
+             distribution = tfp.distributions.Categorical(logits=logits)
+             log_probs = distribution.log_prob(actions)
+             ratio = tf.exp(log_probs - old_log_probs)
+
+             # 3. Compute the clipped value loss
+             values_clipped = old_values + tf.clip_by_value(values - old_values, -CLIP_RANGE, CLIP_RANGE)
+             value_loss1 = tf.square(returns - values)
+             value_loss2 = tf.square(returns - values_clipped)
+             value_loss = 0.5 * tf.reduce_mean(tf.maximum(value_loss1, value_loss2))
+
+             # 4. Compute the clipped policy loss
+             pg_loss1 = -advantages * ratio
+             pg_loss2 = -advantages * tf.clip_by_value(ratio, 1.0 - CLIP_RANGE, 1.0 + CLIP_RANGE)
+             policy_loss = tf.reduce_mean(tf.maximum(pg_loss1, pg_loss2))
+
+             # 5. Compute the entropy bonus
+             entropy = tf.reduce_mean(distribution.entropy())
+
+             # 6. Total loss
+             total_loss = policy_loss + VALUE_COEF * value_loss - ENTROPY_COEF * entropy
+
+         # Apply gradients
+         grads = tape.gradient(total_loss, self.policy.trainable_variables + self.value.trainable_variables)
+         grads, _ = tf.clip_by_global_norm(grads, MAX_GRAD_NORM)
+         self.optimizer.apply_gradients(zip(grads, self.policy.trainable_variables + self.value.trainable_variables))
+
+         return -policy_loss, value_loss, entropy
+
+     def learn(self, ppo_batch):
+         """PPO update loop with data preparation and mini-batching."""
+
+         # 1. Update RMS and sync checkpoint variables (in eager mode)
+         self.obs_rms.update(ppo_batch['observations'])
+         self.rms_mean_var.assign(self.obs_rms.mean)
+         self.rms_var_var.assign(self.obs_rms.var)
+         self.rms_count_var.assign(self.obs_rms.count)
+
+         # 2. Normalize observations using the NumPy-based RMS object
+         obs = self.normalize_obs(ppo_batch['observations'])
+
+         actions = ppo_batch['actions']
+         old_log_probs = ppo_batch['log_probs']
+         returns = ppo_batch['returns']
+         advantages = ppo_batch['advantages']
+         old_values = ppo_batch['old_values']
+
+         # Normalizing advantages is critical for PPO stability
+         advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)
+
+         # 3. Cast all prepared data to TensorFlow tensors for graph execution
+         obs_tensor = tf.convert_to_tensor(obs, dtype=tf.float32)
+         actions_tensor = tf.convert_to_tensor(actions, dtype=tf.int64)
+         old_log_probs_tensor = tf.convert_to_tensor(old_log_probs, dtype=tf.float32)
+         returns_tensor = tf.convert_to_tensor(returns.flatten(), dtype=tf.float32)
+         advantages_tensor = tf.convert_to_tensor(advantages.flatten(), dtype=tf.float32)
+         old_values_tensor = tf.convert_to_tensor(old_values.flatten(), dtype=tf.float32)
+
+         batch_size = obs_tensor.shape[0]
+         minibatch_size = batch_size // NUM_MINIBATCHES
+
+         policy_losses, value_losses, entropies = [], [], []
+
+         for epoch in range(PPO_EPOCHS):
+             # Shuffle indices
+             indices = tf.range(batch_size)
+             shuffled_indices = tf.random.shuffle(indices)
+
+             for start in range(0, batch_size, minibatch_size):
+                 end = start + minibatch_size
+                 minibatch_indices = shuffled_indices[start:end]
+
+                 # Gather minibatch data (this happens efficiently on CPU/GPU)
+                 mb_obs = tf.gather(obs_tensor, minibatch_indices)
+                 mb_actions = tf.gather(actions_tensor, minibatch_indices)
+                 mb_old_log_probs = tf.gather(old_log_probs_tensor, minibatch_indices)
+                 mb_returns = tf.gather(returns_tensor, minibatch_indices)
+                 mb_advantages = tf.gather(advantages_tensor, minibatch_indices)
+                 mb_old_values = tf.gather(old_values_tensor, minibatch_indices)
+
+                 # Perform the learning step (runs in the traced graph)
+                 p_loss, v_loss, entropy = self.learn_step(
+                     mb_obs, mb_actions, mb_old_log_probs, mb_returns, mb_advantages, mb_old_values
+                 )
+                 policy_losses.append(p_loss.numpy())
+                 value_losses.append(v_loss.numpy())
+                 entropies.append(entropy.numpy())
+
+         return np.mean(policy_losses), np.mean(value_losses), np.mean(entropies)
+
+     def save_model(self, save_dir, timesteps):
+         """Saves the full checkpoint using the manager."""
+         self.checkpoint_manager.save(checkpoint_number=timesteps)
+
+     def load_model(self, save_dir, timesteps):
+         """Loads the latest successful checkpoint."""
+         latest_checkpoint = self.checkpoint_manager.latest_checkpoint
+
+         if latest_checkpoint:
+             print(f"Restoring checkpoint from {latest_checkpoint}...")
+             # Restore everything tracked by the checkpoint object
+             self.checkpoint.restore(latest_checkpoint).expect_partial()
+
+             # --- IMPORTANT: update self.obs_rms with the restored tensor values ---
+             self.obs_rms.mean = self.rms_mean_var.numpy()
+             self.obs_rms.var = self.rms_var_var.numpy()
+             self.obs_rms.count = self.rms_count_var.numpy()
+
+             print("Model, Optimizer, and Normalizer restored successfully.")
+         else:
+             raise FileNotFoundError(f"No checkpoint found in {self.checkpoint_manager.directory}")
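As a quick sanity check of the agent above, here is a minimal sketch (not part of the upload; it assumes agent.py, config.py, and their dependencies are importable) that builds the agent for LunarLander's 8-dimensional observations and 4 discrete actions and runs one batched action selection:

import numpy as np
from agent import PPOAgent
from config import TOTAL_TIMESTEPS

# Hypothetical smoke test: shapes match LunarLander (8 obs dims, 4 discrete actions).
agent = PPOAgent(obs_shape=(8,), action_size=4, total_timesteps=TOTAL_TIMESTEPS)

obs = np.random.randn(12, 8).astype(np.float32)       # one fake observation per parallel env
actions, values, log_probs = agent.select_action(obs)
print(actions.shape, values.shape, log_probs.shape)   # expected: (12,) (12,) (12,)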
config.py ADDED
@@ -0,0 +1,42 @@
+ import tensorflow as tf
+ import os
+
+ # --- Environment Configuration ---
+ ENV_ID = "LunarLander-v3"
+ SEED = 123
+ NUM_ENVS = 12
+
+ # --- PPO Hyperparameters ---
+ TOTAL_TIMESTEPS = 15_000_000
+ N_STEPS = 4096  # Steps per environment per rollout (4096 * 12 = 49,152 total steps per update)
+ GAMMA = 0.99
+ GAE_LAMBDA = 0.95
+ PPO_EPOCHS = 15
+ NUM_MINIBATCHES = 4
+ CLIP_RANGE = 0.1
+ LEARNING_RATE = 3e-4
+ RMS_WARMUP_STEPS = 5000
+
+ # --- Loss Coefficients ---
+ VALUE_COEF = 0.5
+ ENTROPY_COEF = 0.1
+ MAX_GRAD_NORM = 0.5
+
+ # --- Hardware and Logging ---
+ DEVICE = 'GPU' if tf.config.list_physical_devices('GPU') else 'CPU'
+ if DEVICE == 'GPU':
+     gpus = tf.config.experimental.list_physical_devices('GPU')
+     if gpus:
+         try:
+             for gpu in gpus:
+                 tf.config.experimental.set_memory_growth(gpu, True)
+         except RuntimeError as e:
+             print(e)
+ LOG_DIR = f"./Lunar_Lander_Discrete_logs/ppo_{ENV_ID.lower()}"
+
+ # --- Checkpointing and Resuming ---
+ SAVE_PATH_ROOT = "./Lunar_Lander_Discrete_models"
+ SAVE_PATH = os.path.join(SAVE_PATH_ROOT, f"ppo_{ENV_ID.lower()}")
+ RESUME_FILE = f"ppo_{ENV_ID.lower()}_resume.json"
+ # The checkpoint is saved every 21 rollouts
+ CHECKPOINT_FREQ = N_STEPS * NUM_ENVS * 21  # 4096 * 12 * 21 = 1,032,192 timesteps between checkpoints,
+                                            # after the initial 49,152-step rollout that includes the RMS warm-up
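For reference, a short sketch of how the constants above combine (values derived from this config, assuming it is importable as config):

from config import N_STEPS, NUM_ENVS, NUM_MINIBATCHES, TOTAL_TIMESTEPS, CHECKPOINT_FREQ

batch_per_update = N_STEPS * NUM_ENVS                          # 4096 * 12 = 49,152 transitions per rollout
minibatch_size = batch_per_update // NUM_MINIBATCHES           # 49,152 / 4 = 12,288
num_updates = TOTAL_TIMESTEPS // batch_per_update              # 15,000,000 // 49,152 = 305 PPO updates
rollouts_per_checkpoint = CHECKPOINT_FREQ // batch_per_update  # 21
print(batch_per_update, minibatch_size, num_updates, rollouts_per_checkpoint)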
main.py ADDED
@@ -0,0 +1,246 @@
+ import tensorflow as tf
+ import numpy as np
+ import os
+ import time
+ from datetime import datetime
+ # Import all constants
+ from config import *
+ from utils import make_parallel_envs, save_resume_data, load_resume_data
+ from agent import PPOAgent
+
+ # --- GAE Calculation ---
+ def compute_gae(rewards, values, dones, next_value, gamma, gae_lambda):
+     rewards = rewards.astype(np.float32)
+     values = values.astype(np.float32)
+     dones = dones.astype(np.float32)
+     next_value = next_value.astype(np.float32)
+     advantages = np.zeros_like(rewards, dtype=np.float32)
+     last_gae_lambda = np.zeros_like(rewards[0], dtype=np.float32)
+
+     # Concatenate current values and next_value for V(s')
+     values = np.concatenate([values, next_value[None, :]], axis=0)
+
+     for t in reversed(range(N_STEPS)):
+         # TD error: delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
+         delta = rewards[t] + gamma * values[t + 1] * (1 - dones[t]) - values[t]
+
+         # GAE recursion: A_t = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}
+         last_gae_lambda = delta + gamma * gae_lambda * (1 - dones[t]) * last_gae_lambda
+         advantages[t] = last_gae_lambda
+
+     # Returns: R_t = A_t + V(s_t)
+     returns = advantages + values[:-1]
+     return advantages, returns
+ # --- End GAE Calculation ---
+
+ def train():
+     print(f"--- Running on {DEVICE} ---")
+
+     os.makedirs(LOG_DIR, exist_ok=True)
+     os.makedirs(SAVE_PATH, exist_ok=True)
+     summary_writer = tf.summary.create_file_writer(LOG_DIR)
+
+     vec_env = make_parallel_envs(ENV_ID, NUM_ENVS, SEED)
+     obs_shape = vec_env.single_observation_space.shape
+     action_size = vec_env.single_action_space.n
+
+     ACTION_DTYPE = vec_env.action_space.dtype
+     print(f"Action space dtype retrieved: {ACTION_DTYPE}. Forcing agent output to this type.")
+
+     agent = PPOAgent(obs_shape, action_size, TOTAL_TIMESTEPS)
+
+     resume_path = os.path.join(os.path.dirname(SAVE_PATH) or '.', RESUME_FILE)
+     initial_timesteps, initial_episode = load_resume_data(resume_path)
+     current_timesteps = initial_timesteps
+     current_episode = initial_episode
+     obs, info = vec_env.reset(seed=SEED)
+
+     # --- CRITICAL FIX FOR LOGGING RELIABILITY: global tracking list used at checkpoint time ---
+     all_episode_returns = []
+
+     # --- INITIAL NORMALIZATION ADAPTATION ---
+     if current_timesteps == 0:
+         print("Adapting observation normalizer for stability...")
+         initial_observations = []
+         current_obs = obs
+         for _ in range(RMS_WARMUP_STEPS // NUM_ENVS):
+             action_array = vec_env.action_space.sample()
+             actions_to_step = np.ascontiguousarray(action_array.reshape(NUM_ENVS).astype(ACTION_DTYPE))
+             current_obs, _, _, _, _ = vec_env.step(actions_to_step)
+             initial_observations.append(current_obs)
+
+         initial_observations = np.array(initial_observations).reshape(-1, obs_shape[0])
+         agent.adapt_normalization(initial_observations)
+
+         obs, info = vec_env.reset(seed=SEED)
+         print("Normalizer adapted. Starting training.")
+
+     elif initial_timesteps > 0:
+         try:
+             agent.load_model(SAVE_PATH, initial_timesteps)
+             print(f"Resumed training from timestep: {initial_timesteps}")
+         except Exception as e:
+             print(f"Error loading model weights at {initial_timesteps}: {e}. Starting from scratch.")
+             current_timesteps = 0
+             current_episode = 0
+             obs, info = vec_env.reset(seed=SEED)
+
+     start_time = time.time()
+
+     base_timesteps = current_timesteps
+
+     while current_timesteps < TOTAL_TIMESTEPS:
+         rollout_data = {'observations': [], 'actions': [], 'log_probs': [],
+                         'rewards': [], 'values': [], 'dones': [], 'old_values': []}
+
+         episode_returns_list = []  # Tracks returns for CLI output
+
+         for step in range(N_STEPS):
+             actions, values, log_probs = agent.select_action(obs)
+
+             actions_to_step = np.ascontiguousarray(actions.reshape(NUM_ENVS).astype(ACTION_DTYPE))
+
+             new_obs, rewards, terminateds, truncateds, infos = vec_env.step(actions_to_step)
+             dones = np.logical_or(terminateds, truncateds)
+
+             # --- DEBUG PRINT (disabled) ---
+             # if np.any(dones):
+             #     print(f"DEBUG STEP: Done signal received! Raw infos keys: {infos.keys()}")
+             #     if 'final_info' in infos:
+             #         print("DEBUG STEP: final_info key IS present.")
+             #     elif 'episode' in infos:
+             #         print("DEBUG STEP: 'final_info' key IS MISSING, but raw 'episode' key IS present. Attempting fallback extraction.")
+             #     else:
+             #         print("DEBUG STEP: 'final_info' key IS MISSING from infos dictionary.")
+             # --- END DEBUG PRINT ---
+
+             # Store data
+             rollout_data['observations'].append(np.array(obs).copy())
+             rollout_data['actions'].append(actions_to_step.copy())
+             rollout_data['log_probs'].append(np.array(log_probs).copy())
+             rollout_data['values'].append(np.array(values).copy())
+             rollout_data['old_values'].append(np.array(values).copy())
+             rollout_data['rewards'].append(np.array(rewards).copy())
+             rollout_data['dones'].append(np.array(dones).copy())
+
+             obs = new_obs
+
+             # --- AsyncVectorEnv logging mechanism (extract episode stats) ---
+             log_step = base_timesteps + ((step + 1) * NUM_ENVS)
+
+             # 1. Standard extraction (expected behaviour: final_info is a list of dicts)
+             if 'final_info' in infos:
+                 env_infos_to_process = infos['final_info']
+
+             # 2. Fallback extraction (observed behaviour: a raw 'episode' key is present)
+             #    Less common, but necessary for this environment's vectorized output.
+             elif 'episode' in infos:
+                 # Assume 'episode' is a dict containing arrays of returns/lengths for finished episodes;
+                 # this is where single-env stats get dumped if they are not aggregated.
+                 env_infos_to_process = [{'episode': {'r': r, 'l': l}}
+                                         for r, l in zip(infos['episode']['r'], infos['episode']['l'])]
+             else:
+                 env_infos_to_process = []
+
+             for env_info in env_infos_to_process:
+                 # Must be a dictionary and not None
+                 if env_info is not None and isinstance(env_info, dict):
+                     # The episode stats are nested under the 'episode' key
+                     if 'episode' in env_info and isinstance(env_info['episode'], dict):
+                         episode_stats = env_info['episode']
+
+                         # Explicitly check for 'r' (return) and 'l' (length) keys
+                         if 'r' in episode_stats and 'l' in episode_stats:
+                             episode_return = float(episode_stats['r'])
+                             episode_length = int(episode_stats['l'])
+
+                             # CRITICAL: populate both the per-rollout and the global list
+                             episode_returns_list.append(episode_return)
+                             all_episode_returns.append(episode_return)  # global list used for checkpoint logging
+                             current_episode += 1
+
+                             with summary_writer.as_default():
+                                 tf.summary.scalar("rollout/episode_return_single", episode_return, step=log_step)
+                                 tf.summary.scalar("rollout/episode_length_single", episode_length, step=log_step)
+             # --- End AsyncVectorEnv logging mechanism ---
+
+         # Increment total timesteps after the rollout is collected
+         current_timesteps += N_STEPS * NUM_ENVS
+         base_timesteps = current_timesteps
+
+         # --- PPO UPDATE ---
+         rollout_data = {k: np.asarray(v) for k, v in rollout_data.items()}
+         _, next_values, _ = agent.select_action(obs)
+         rollout_data['advantages'], rollout_data['returns'] = compute_gae(
+             rollout_data['rewards'],
+             rollout_data['values'],
+             rollout_data['dones'],
+             next_values,
+             GAMMA,
+             GAE_LAMBDA
+         )
+         ppo_batch = {}
+         for k in ['observations', 'actions', 'log_probs', 'returns', 'advantages', 'old_values']:
+             if rollout_data[k].ndim > 2:
+                 ppo_batch[k] = rollout_data[k].reshape((-1,) + rollout_data[k].shape[2:])
+             else:
+                 ppo_batch[k] = rollout_data[k].reshape((-1,))
+
+         policy_loss, value_loss, entropy = agent.learn(ppo_batch)
+
+         # 4. Logging
+         fps = current_timesteps / (time.time() - start_time)
+         with summary_writer.as_default():
+             tf.summary.scalar("train/policy_loss", policy_loss, step=current_timesteps)
+             tf.summary.scalar("train/value_loss", value_loss, step=current_timesteps)
+             tf.summary.scalar("train/entropy", entropy, step=current_timesteps)
+             tf.summary.scalar("perf/timesteps_per_second", fps, step=current_timesteps)
+
+             if episode_returns_list:
+                 avg_rollout_return = np.mean(episode_returns_list)
+                 tf.summary.scalar("rollout/rollout_average_return_rollout", avg_rollout_return, step=current_timesteps)
+             tf.summary.scalar("train/entropy_coef", ENTROPY_COEF, step=current_timesteps)
+
+         summary_writer.flush()
+
+         # CLI reporting
+         if episode_returns_list:
+             avg_return = np.mean(episode_returns_list)
+             print(f"T: {current_timesteps:<10} | Avg Return: {avg_return:7.2f} | Loss: {policy_loss:7.4f} | FPS: {fps:4.0f}")
+         else:
+             print(f"T: {current_timesteps:<10} | No episodes completed this rollout. | Loss: {policy_loss:7.4f} | FPS: {fps:4.0f}")
+
+         # 5. Checkpointing & guaranteed logging
+         # A checkpoint is saved whenever the timesteps land on the defined frequency
+         if (current_timesteps - N_STEPS * NUM_ENVS) % CHECKPOINT_FREQ == 0:
+
+             # --- DIAGNOSTIC CHECK ---
+             print(f"--- DIAGNOSTIC: Total recorded episodes: {len(all_episode_returns)} ---")
+
+             # Log the overall average reward since the start of training
+             if all_episode_returns:
+                 # Use the last 200 episodes for a smooth, reliable moving average
+                 overall_avg_return = np.mean(all_episode_returns[-200:])
+                 with summary_writer.as_default():
+                     # This tag should always populate in TensorBoard
+                     tf.summary.scalar("rollout/overall_average_return_checkpoint", overall_avg_return, step=current_timesteps)
+                 print(f"--- CHECKPOINT: Overall Avg Reward (Last 200): {overall_avg_return:.2f} ---")
+
+             # Save model and resume data
+             agent.save_model(SAVE_PATH, current_timesteps)
+             save_resume_data(resume_path, current_timesteps, current_episode)
+             print(f"Checkpoint saved at timestep: {current_timesteps}")
+
+     # Final save after loop termination
+     print(f"\nTarget reached ({TOTAL_TIMESTEPS} timesteps). Saving final model.")
+     agent.save_model(SAVE_PATH, current_timesteps)
+     save_resume_data(resume_path, current_timesteps, current_episode)
+
+     vec_env.close()
+     summary_writer.close()
+     print("Training complete.")
+
+ if __name__ == "__main__":
+     train()
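The recursion in compute_gae follows the standard GAE formulas noted in its comments. Here is a standalone toy sketch of the same recursion for a single environment over three steps (made-up numbers, independent of the N_STEPS-sized arrays used above):

import numpy as np

gamma, lam = 0.99, 0.95
rewards = np.array([1.0, 0.0, -1.0], dtype=np.float32)
values  = np.array([0.5, 0.4, 0.3], dtype=np.float32)   # V(s_t)
dones   = np.array([0.0, 0.0, 1.0], dtype=np.float32)
next_value = 0.0                                         # bootstrap value V(s_T)

v = np.append(values, next_value)
advantages = np.zeros_like(rewards)
last_gae = 0.0
for t in reversed(range(len(rewards))):
    delta = rewards[t] + gamma * v[t + 1] * (1 - dones[t]) - v[t]   # TD error
    last_gae = delta + gamma * lam * (1 - dones[t]) * last_gae      # GAE recursion
    advantages[t] = last_gae
returns = advantages + values                                        # targets for the value loss
print(advantages, returns)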
reward_shaping.py ADDED
@@ -0,0 +1,113 @@
+ import gymnasium as gym
+ import numpy as np
+ from gymnasium.spaces import Box
+ from gymnasium import Wrapper
+
+ # --- CONFIGURATION FOR REWARD SHAPING ---
+ # Sustainable time penalty for fuel efficiency.
+ TIME_PENALTY = -0.05
+ # ----------------------------------------
+
+ class LunarLanderRewardShaping(Wrapper):
+     """
+     Hyper-optimized shaping for a stable, centered, fuel-efficient landing.
+     Uses catastrophic penalties to enforce terminal stabilization and centering.
+     """
+     def __init__(self, env):
+         super().__init__(env)
+         self.last_shaping_reward = None
+
+     def step(self, action):
+         observation, reward, terminated, truncated, info = self.env.step(action)
+
+         # 1. Unpack the relevant variables from the observation
+         x_pos = observation[0]
+         y_pos = observation[1]
+         x_vel = observation[2]
+         y_vel = observation[3]
+         angle = observation[4]
+         angular_vel = observation[5]
+
+         # Leg contact boolean/float values
+         left_leg_contact = observation[6]
+         right_leg_contact = observation[7]
+
+         # Determine if the main engine (action 2) was fired
+         main_engine_fired = 1 if action == 2 else 0
+
+         # Determine if *any* thruster (actions 1, 2, or 3) was fired
+         any_thruster_fired = 1 if action != 0 else 0
+
+         # Proximity factor (close to 1.0 near the ground, 0.0 high up)
+         proximity_factor = 1.0 - y_pos
+
+         # 2. Calculate the total current shaping value
+         current_shaping_reward = 0.0
+
+         # A. Height-weighted horizontal position penalty
+         #    Heavily penalizes being off-center (-100) ONLY when close to the ground,
+         #    forcing the agent to center its x-position at low altitude.
+         current_shaping_reward += -100 * np.abs(x_pos) * proximity_factor
+
+         # B. Vertical velocity penalty near the ground (remains strong for a soft landing)
+         current_shaping_reward += -40 * np.abs(y_vel) * proximity_factor
+
+         # C. Scaled-back horizontal speed penalty
+         #    Relaxed (-5) to encourage smooth lateral coasting and save side-thruster fuel.
+         current_shaping_reward += -5 * np.abs(x_vel)
+
+         # D. Sharpened angle penalty
+         #    Strong pressure (-10) to hold a vertical attitude.
+         current_shaping_reward += -10 * np.abs(angle)
+
+         # E. Penalize a high spin rate
+         current_shaping_reward += -10 * np.abs(angular_vel)
+
+         # F. Main-engine usage penalty near the ground
+         current_shaping_reward += -20 * main_engine_fired * proximity_factor
+
+         # G. Relaxed height penalty (pressure to descend, but not panic)
+         current_shaping_reward += -10 * y_pos
+
+         # H. Catastrophic post-contact thrust penalty
+         #    Set to -3000 so that ANY post-contact thrust is worse than failing the episode,
+         #    forcing the agent to choose "Action 0: do nothing" immediately upon touchdown.
+         contact_sum = left_leg_contact + right_leg_contact
+         if contact_sum > 0:
+             current_shaping_reward += -3000 * any_thruster_fired * contact_sum
+
+         # I. Aggressive landing-leg use incentive
+         current_shaping_reward += 15 * contact_sum
+
+         # 3. Calculate the differential shaping reward (reward for improvement)
+         if self.last_shaping_reward is not None:
+             shaping_reward_diff = current_shaping_reward - self.last_shaping_reward
+             # Clip the differential reward to prevent massive, unstable jumps
+             reward += np.clip(shaping_reward_diff, -10.0, 10.0)
+
+         self.last_shaping_reward = current_shaping_reward
+
+         # --- SUSTAINABLE TIME PENALTY (-0.05) ---
+         reward += TIME_PENALTY
+         # ----------------------------------------
+
+         return observation, reward, terminated, truncated, info
+
+     def reset(self, **kwargs):
+         """Resets the environment and the shaping tracker."""
+         self.last_shaping_reward = None
+         return self.env.reset(**kwargs)
+
+     def render(self):
+         """Passes the render call to the base environment."""
+         return self.env.render()
+
+     def close(self):
+         """Passes the close call to the base environment."""
+         return self.env.close()
+
+     def __getattr__(self, name):
+         """Delegate attribute access (like observation_space) to the wrapped environment."""
+         return getattr(self.env, name)
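A minimal usage sketch of the wrapper above (assumes gymnasium with the Box2D extra is installed; on the very first step only the time penalty is added, since the differential term needs a previous shaping value):

import gymnasium as gym
from reward_shaping import LunarLanderRewardShaping

env = LunarLanderRewardShaping(gym.make("LunarLander-v3"))
obs, info = env.reset(seed=0)
obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
print(reward)   # base reward + TIME_PENALTY on the first step; shaping kicks in afterwards
env.close()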
trained_agent.py ADDED
@@ -0,0 +1,120 @@
+ import gymnasium as gym
+ import numpy as np
+ import tensorflow as tf
+ import os
+ import time
+
+ # config, agent, and utils are assumed to be in the same directory
+ from config import ENV_ID, SEED, SAVE_PATH, TOTAL_TIMESTEPS
+ from agent import PPOAgent
+ # --- CRITICAL IMPORT ---
+ # Import the custom wrapper class used during training
+ from reward_shaping import LunarLanderRewardShaping
+ # -----------------------
+
+ # Force tf.function-decorated code to run eagerly so .numpy() calls work during playback
+ tf.config.run_functions_eagerly(True)
+
+ def run_trained_agent(episodes=10):
+     """
+     Loads the latest available trained PPO checkpoint and runs it for the given number of episodes.
+     """
+     print(f"--- Running Trained Agent on {ENV_ID} with Human Rendering ---")
+
+     # 1. Environment setup: use a single, non-vectorized environment for clean inference.
+     try:
+         # Create the base environment
+         env = gym.make(ENV_ID, render_mode="human")
+
+         # --- CRITICAL FIX: wrap the environment exactly as during training ---
+         # The agent was trained on this wrapper, so it must be evaluated on it.
+         env = LunarLanderRewardShaping(env)
+         # ---------------------------------------------------------------------
+
+     except Exception as e:
+         print(f"ERROR: Could not create environment {ENV_ID} or apply wrapper. Details: {e}")
+         return
+
+     obs_shape = env.observation_space.shape
+
+     # Explicitly pass the integer size (.n) instead of the Discrete object
+     action_size = env.action_space.n
+
+     # Reset the single environment
+     current_obs, info = env.reset(seed=SEED)
+
+     # 2. Agent initialization (pass the integer size of the action space)
+     agent = PPOAgent(obs_shape, action_size, TOTAL_TIMESTEPS)
+
+     # 3. Load the latest checkpoint
+     latest_checkpoint = agent.checkpoint_manager.latest_checkpoint
+
+     if not latest_checkpoint:
+         print("\nERROR: Could not find any checkpoint in the designated save path.")
+         env.close()
+         return
+     try:
+         # Restore everything tracked by the checkpoint object
+         agent.checkpoint.restore(latest_checkpoint).expect_partial()
+
+         # Manually sync the NumPy-based RMS object after restoration
+         agent.obs_rms.mean = agent.rms_mean_var.numpy()
+         agent.obs_rms.var = agent.rms_var_var.numpy()
+         agent.obs_rms.count = agent.rms_count_var.numpy()
+
+         loaded_timesteps = int(os.path.basename(latest_checkpoint).split('-')[-1])
+         print(f"\nSuccessfully loaded latest checkpoint trained to T={loaded_timesteps}")
+
+     except Exception as e:
+         print(f"\nERROR: Failed to restore checkpoint at {latest_checkpoint}. Details: {e}")
+         print("Suggestion: check the consistency of your environment setup (wrapper, action size).")
+         env.close()
+         return
+
+     # 4. Run episodes
+     print(f"\nStarting {episodes} playback episodes...")
+     total_rewards = []
+     for i in range(episodes):
+         done = False
+         episode_reward = 0
+         step_count = 0
+
+         while not done:
+             # Call env.render() inside the loop to refresh the visualization
+             env.render()
+
+             # current_obs has shape (8,); reshape to (1, 8) for the agent's model input
+             obs_to_agent = current_obs.reshape(1, *obs_shape)
+
+             # Select an action
+             actions, _, _ = agent.select_action(obs_to_agent)
+
+             # Step the environment with the single action
+             action_to_step = actions[0]
+             current_obs, reward, terminated, truncated, info = env.step(action_to_step)
+
+             done = terminated or truncated
+             episode_reward += reward
+             step_count += 1
+
+             # Simple visualization pause
+             time.sleep(0.01)
+
+         total_rewards.append(episode_reward)
+         print(f"Episode {i+1}: Reward = {episode_reward:7.2f}, Steps = {step_count}")
+
+         # Reset for the next episode
+         current_obs, info = env.reset()
+
+     # 5. Cleanup and summary
+     env.close()
+
+     if total_rewards:
+         print("-" * 30)
+         print(f"Average Reward over {episodes} episodes: {np.mean(total_rewards):7.2f}")
+         print("-" * 30)
+
+ if __name__ == "__main__":
+     run_trained_agent(episodes=10)
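Note that select_action samples from the categorical policy, so playback above remains stochastic. A hedged sketch of a deterministic alternative for evaluation (a hypothetical helper, not part of this upload) that picks the argmax action from the same policy network:

import numpy as np
import tensorflow as tf

def select_action_greedy(agent, obs):
    """Evaluation-only variant: take the most likely action instead of sampling."""
    obs_tensor = tf.convert_to_tensor(obs, dtype=tf.float32)
    logits = agent.policy(agent.normalize_obs(obs_tensor), training=False)
    return np.argmax(logits.numpy(), axis=-1)

# Drop-in replacement inside the playback loop:
# action_to_step = select_action_greedy(agent, obs_to_agent)[0]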
utils.py ADDED
@@ -0,0 +1,102 @@
+ import gymnasium as gym
+ from gymnasium.vector import AsyncVectorEnv
+ import numpy as np
+ import tensorflow as tf
+ import json
+ import os
+ import time
+
+ # --- Import the reward shaping wrapper ---
+ from reward_shaping import LunarLanderRewardShaping
+ # -----------------------------------------
+ from config import *  # Import constants like ENV_ID, N_STEPS, NUM_ENVS, GAMMA, RESUME_FILE
+
+ # --- Helper Functions for Checkpointing ---
+ def save_resume_data(filepath, timesteps, episodes):
+     """
+     Saves the current training state to a JSON file at the specified filepath.
+     """
+     data = {
+         "timesteps": timesteps,
+         "episode_count": episodes,
+         "date": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+     }
+     try:
+         with open(filepath, 'w') as f:
+             json.dump(data, f)
+     except Exception as e:
+         print(f"Error saving resume data to {filepath}: {e}")
+
+ def load_resume_data(filepath):
+     """
+     Loads the last saved training state from the specified JSON file.
+     Returns (timesteps, episode_count).
+     """
+     if os.path.exists(filepath):
+         try:
+             with open(filepath, 'r') as f:
+                 data = json.load(f)
+             timesteps = data.get("timesteps", 0)
+             episodes = data.get("episode_count", 0)
+             return (timesteps, episodes)
+         except Exception as e:
+             print(f"Error loading resume data from {filepath}: {e}. Starting from scratch.")
+     return (0, 0)
+
+ # --- Environment Setup for Parallel Execution ---
+ def make_env(env_id, seed, idx, **kwargs):
+     """
+     Creates a single environment instance with a unique seed and applies the necessary wrappers.
+     """
+     def thunk():
+         # 1. Create the base environment
+         env = gym.make(env_id, **kwargs)
+
+         # 2. Apply the custom wrapper (LunarLanderRewardShaping)
+         #    Essential for LunarLander to learn from a dense reward structure
+         env = LunarLanderRewardShaping(env)
+
+         # 3. Apply the standard logging wrapper (critical fix for logging)
+         #    This wrapper tracks the episode reward ('r') and length ('l')
+         #    and puts them into the 'infos' dictionary when the episode is done.
+         env = gym.wrappers.RecordEpisodeStatistics(env)
+
+         # 4. Apply seeding to the final wrapped environment
+         env.action_space.seed(seed + idx)
+         env.observation_space.seed(seed + idx)
+
+         # 'env' is now the fully wrapped, final environment instance
+         return env
+     return thunk
+
+ def make_parallel_envs(env_id, num_envs, seed):
+     """
+     Creates multiple environments and wraps them in an AsyncVectorEnv.
+     """
+     env_fns = [make_env(env_id, seed, i) for i in range(num_envs)]
+     return AsyncVectorEnv(env_fns)
+
+ # --- GAE Calculation (note: main.py defines and uses its own compute_gae) ---
+ def calculate_gae(rewards, values, terminated, truncated, next_value, gamma=GAMMA, gae_lambda=GAE_LAMBDA):
+     """
+     Calculates Generalized Advantage Estimation (GAE) and returns from rollout data.
+     """
+     advantages = np.zeros_like(rewards, dtype=np.float32)
+     last_gae_lambda = 0
+
+     for t in reversed(range(N_STEPS)):
+         non_terminal = 1.0 - (terminated[t] | truncated[t]).astype(np.float32)
+         # V(s_{t+1}): bootstrap with next_value for the last step, otherwise use the next stored value
+         next_values = next_value if t == N_STEPS - 1 else values[t + 1]
+
+         delta = rewards[t] + gamma * next_values * non_terminal - values[t]
+
+         advantages[t] = delta + gamma * gae_lambda * non_terminal * last_gae_lambda
+         last_gae_lambda = advantages[t]
+
+     returns = advantages + values
+     return advantages, returns
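A small usage sketch for the resume helpers above (hypothetical file name; assumes the project dependencies are installed, since importing utils pulls in config and TensorFlow):

from utils import save_resume_data, load_resume_data

save_resume_data("demo_resume.json", timesteps=49_152, episodes=37)
print(load_resume_data("demo_resume.json"))   # -> (49152, 37)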