Upload 6 files

- agent.py (+278)
- config.py (+42)
- main.py (+246)
- reward_shaping.py (+113)
- trained_agent.py (+120)
- utils.py (+102)
agent.py
ADDED

import tensorflow as tf
from keras.layers import Dense
import tensorflow_probability as tfp
import numpy as np
import os
from config import *

# Helper to normalize observations with a running mean/variance (parallel-moments update)
class RunningMeanStd:
    def __init__(self, shape):
        self.mean = np.zeros(shape, dtype=np.float32)
        self.var = np.ones(shape, dtype=np.float32)
        self.count = 1e-4

    def update(self, x):
        batch_mean = np.mean(x, axis=0)
        batch_var = np.var(x, axis=0)
        batch_count = x.shape[0]
        self.update_from_moments(batch_mean, batch_var, batch_count)

    def update_from_moments(self, batch_mean, batch_var, batch_count):
        delta = batch_mean - self.mean
        total_count = self.count + batch_count

        new_mean = self.mean + delta * batch_count / total_count

        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m2 = m_a + m_b + np.square(delta) * self.count * batch_count / total_count
        new_var = m2 / total_count

        self.mean = new_mean
        self.var = new_var
        self.count = total_count

class PPOAgent:
    def __init__(self, obs_shape, action_size, total_timesteps):
        self.obs_shape = obs_shape
        self.action_size = action_size

        # Networks
        self.policy = self._build_policy_model(obs_shape, action_size)
        self.value = self._build_value_model(obs_shape)

        # Learning rate schedule (crucial for PPO)
        self.lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
            initial_learning_rate=LEARNING_RATE,
            decay_steps=total_timesteps * PPO_EPOCHS / (N_STEPS * NUM_ENVS),
            end_learning_rate=0.0
        )
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.lr_schedule, epsilon=1e-5)

        # Observation normalizer
        self.obs_rms = RunningMeanStd(shape=obs_shape)

        # --- CHECKPOINTING SETUP ---
        # Wrap RMS parameters in tf.Variable so they can be tracked and saved
        self.rms_mean_var = tf.Variable(self.obs_rms.mean, dtype=tf.float32, name="obs_rms_mean")
        self.rms_var_var = tf.Variable(self.obs_rms.var, dtype=tf.float32, name="obs_rms_var")
        self.rms_count_var = tf.Variable(self.obs_rms.count, dtype=tf.float32, name="obs_rms_count")

        self.checkpoint = tf.train.Checkpoint(
            policy=self.policy,
            value=self.value,
            optimizer=self.optimizer,
            obs_rms_mean=self.rms_mean_var,
            obs_rms_var=self.rms_var_var,
            obs_rms_count=self.rms_count_var
        )
        self.checkpoint_manager = tf.train.CheckpointManager(
            self.checkpoint, os.path.join(SAVE_PATH, 'tf_checkpoints'), max_to_keep=1000
        )

    def _build_policy_model(self, obs_shape, action_size):
        inputs = tf.keras.Input(shape=obs_shape)
        x = Dense(64, activation='relu')(inputs)
        x = Dense(64, activation='relu')(x)
        logits = Dense(action_size)(x)
        return tf.keras.Model(inputs=inputs, outputs=logits, name="policy_model")

    def _build_value_model(self, obs_shape):
        inputs = tf.keras.Input(shape=obs_shape)
        x = Dense(64, activation='relu')(inputs)
        x = Dense(64, activation='relu')(x)
        value = Dense(1)(x)
        return tf.keras.Model(inputs=inputs, outputs=value, name="value_model")

    def adapt_normalization(self, initial_observations):
        """Updates the observation normalizer using initial data and saves its state to the checkpoint variables."""
        self.obs_rms.update(initial_observations)
        # Update checkpoint variables for persistence
        self.rms_mean_var.assign(self.obs_rms.mean)
        self.rms_var_var.assign(self.obs_rms.var)
        self.rms_count_var.assign(self.obs_rms.count)

    def normalize_obs(self, obs):
        """
        Applies normalization. Runs in Eager mode (NumPy input) or Graph mode (Tensor input).
        The source of the RMS parameters is selected based on the input type.
        """
        is_tensor = tf.is_tensor(obs)

        # Determine the RMS source:
        # the tf.Variables if the input is a Tensor (for learn_step),
        # the NumPy arrays if the input is a NumPy array (for select_action / rollout prep).
        rms_mean = self.rms_mean_var if is_tensor else self.obs_rms.mean
        rms_var = self.rms_var_var if is_tensor else self.obs_rms.var

        # Convert the input to float32 if it is not already a tensor
        if not is_tensor:
            obs = obs.astype(np.float32)

        # Normalization calculation (must use tf functions)
        normalized_obs = (obs - rms_mean) / tf.sqrt(rms_var + 1e-8)
        normalized_obs = tf.clip_by_value(normalized_obs, -10.0, 10.0)

        # Return a NumPy array if the input was NumPy (rollout collection and saving)
        if not is_tensor:
            return normalized_obs.numpy()
        return normalized_obs

    # NOTE: deliberately not decorated with @tf.function, so the .numpy() calls below run in eager mode
    def select_action(self, obs):
        """Selects an action and computes the value and log-prob for the given observation in eager mode."""
        # 1. Convert the incoming NumPy array to a Tensor for the model forward pass
        obs_tensor = tf.convert_to_tensor(obs, dtype=tf.float32)

        # 2. Normalize the observation tensor
        normalized_obs = self.normalize_obs(obs_tensor)

        # Forward pass
        logits = self.policy(normalized_obs, training=False)
        values = self.value(normalized_obs, training=False)

        # Create a categorical distribution and sample
        distribution = tfp.distributions.Categorical(logits=logits)
        actions = distribution.sample()

        # Log probability of the sampled action
        log_probs = distribution.log_prob(actions)

        # 3. Convert EagerTensors back to NumPy arrays (works because we are in eager mode)
        actions_np = actions.numpy().astype(np.int64)
        values_np = values.numpy().flatten()
        log_probs_np = log_probs.numpy()

        return actions_np, values_np, log_probs_np

    # learn_step stays decorated with @tf.function for graph execution
    @tf.function
    def learn_step(self, obs, actions, old_log_probs, returns, advantages, old_values):
        """Performs a single PPO optimization step."""
        with tf.GradientTape() as tape:
            # 1. Forward pass
            logits = self.policy(obs, training=True)
            # Squeeze (B, 1) -> (B,) so the value loss broadcasts against returns
            # elementwise instead of producing a (B, B) matrix.
            values = tf.squeeze(self.value(obs, training=True), axis=-1)

            # 2. Probability ratio
            distribution = tfp.distributions.Categorical(logits=logits)
            log_probs = distribution.log_prob(actions)
            ratio = tf.exp(log_probs - old_log_probs)

            # 3. Clipped value loss
            values_clipped = old_values + tf.clip_by_value(values - old_values, -CLIP_RANGE, CLIP_RANGE)
            value_loss1 = tf.square(returns - values)
            value_loss2 = tf.square(returns - values_clipped)
            value_loss = 0.5 * tf.reduce_mean(tf.maximum(value_loss1, value_loss2))

            # 4. Clipped policy loss
            pg_loss1 = -advantages * ratio
            pg_loss2 = -advantages * tf.clip_by_value(ratio, 1.0 - CLIP_RANGE, 1.0 + CLIP_RANGE)
            policy_loss = tf.reduce_mean(tf.maximum(pg_loss1, pg_loss2))

            # 5. Entropy bonus
            entropy = tf.reduce_mean(distribution.entropy())

            # 6. Total loss
            total_loss = policy_loss + VALUE_COEF * value_loss - ENTROPY_COEF * entropy

        # Apply gradients with global-norm clipping
        grads = tape.gradient(total_loss, self.policy.trainable_variables + self.value.trainable_variables)
        grads, _ = tf.clip_by_global_norm(grads, MAX_GRAD_NORM)
        self.optimizer.apply_gradients(zip(grads, self.policy.trainable_variables + self.value.trainable_variables))

        return -policy_loss, value_loss, entropy

    def learn(self, ppo_batch):
        """PPO update loop with data preparation and mini-batching."""
        # 1. Update the RMS and sync the checkpoint variables (in eager mode)
        self.obs_rms.update(ppo_batch['observations'])
        self.rms_mean_var.assign(self.obs_rms.mean)
        self.rms_var_var.assign(self.obs_rms.var)
        self.rms_count_var.assign(self.obs_rms.count)

        # 2. Normalize observations using the NumPy-based RMS object
        obs = self.normalize_obs(ppo_batch['observations'])

        actions = ppo_batch['actions']
        old_log_probs = ppo_batch['log_probs']
        returns = ppo_batch['returns']
        advantages = ppo_batch['advantages']
        old_values = ppo_batch['old_values']

        # Normalizing advantages is critical for PPO stability
        advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)

        # 3. Cast all prepared data to TensorFlow tensors for graph execution
        obs_tensor = tf.convert_to_tensor(obs, dtype=tf.float32)
        actions_tensor = tf.convert_to_tensor(actions, dtype=tf.int64)
        old_log_probs_tensor = tf.convert_to_tensor(old_log_probs, dtype=tf.float32)
        returns_tensor = tf.convert_to_tensor(returns.flatten(), dtype=tf.float32)
        advantages_tensor = tf.convert_to_tensor(advantages.flatten(), dtype=tf.float32)
        old_values_tensor = tf.convert_to_tensor(old_values.flatten(), dtype=tf.float32)

        batch_size = obs_tensor.shape[0]
        minibatch_size = batch_size // NUM_MINIBATCHES

        policy_losses, value_losses, entropies = [], [], []

        for epoch in range(PPO_EPOCHS):
            # Shuffle indices
            indices = tf.range(batch_size)
            shuffled_indices = tf.random.shuffle(indices)

            for start in range(0, batch_size, minibatch_size):
                end = start + minibatch_size
                minibatch_indices = shuffled_indices[start:end]

                # Gather the minibatch data (runs efficiently on CPU/GPU)
                mb_obs = tf.gather(obs_tensor, minibatch_indices)
                mb_actions = tf.gather(actions_tensor, minibatch_indices)
                mb_old_log_probs = tf.gather(old_log_probs_tensor, minibatch_indices)
                mb_returns = tf.gather(returns_tensor, minibatch_indices)
                mb_advantages = tf.gather(advantages_tensor, minibatch_indices)
                mb_old_values = tf.gather(old_values_tensor, minibatch_indices)

                # Perform the learning step (runs in the traced graph)
                p_loss, v_loss, entropy = self.learn_step(
                    mb_obs, mb_actions, mb_old_log_probs, mb_returns, mb_advantages, mb_old_values
                )
                policy_losses.append(p_loss.numpy())
                value_losses.append(v_loss.numpy())
                entropies.append(entropy.numpy())

        return np.mean(policy_losses), np.mean(value_losses), np.mean(entropies)

    def save_model(self, save_dir, timesteps):
        """Saves the full checkpoint using the manager."""
        self.checkpoint_manager.save(checkpoint_number=timesteps)

    def load_model(self, save_dir, timesteps):
        """Loads the last successful checkpoint."""
        latest_checkpoint = self.checkpoint_manager.latest_checkpoint

        if latest_checkpoint:
            print(f"Restoring checkpoint from {latest_checkpoint}...")
            # Restore everything tracked by the checkpoint object
            self.checkpoint.restore(latest_checkpoint).expect_partial()

            # IMPORTANT: update self.obs_rms with the restored tensor values
            self.obs_rms.mean = self.rms_mean_var.numpy()
            self.obs_rms.var = self.rms_var_var.numpy()
            self.obs_rms.count = self.rms_count_var.numpy()

            print("Model, Optimizer, and Normalizer restored successfully.")
        else:
            raise FileNotFoundError(f"No checkpoint found in {self.checkpoint_manager.directory}")
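
Note on agent.py: RunningMeanStd combines per-batch moments with the stored running moments using the standard parallel-moments formula, so chunked updates should reproduce the moments of the concatenated data (up to the tiny 1e-4 prior count). A minimal sanity check, assuming agent.py and its dependencies (tensorflow, tensorflow_probability, config.py) are importable:

# Sanity check: feeding data in chunks should match np.mean / np.var
# over the concatenated data.
import numpy as np
from agent import RunningMeanStd  # assumption: agent.py is on the path

rng = np.random.default_rng(0)
data = rng.normal(loc=3.0, scale=2.0, size=(10_000, 8)).astype(np.float32)

rms = RunningMeanStd(shape=(8,))
for chunk in np.array_split(data, 10):  # simulate ten rollouts
    rms.update(chunk)

print(np.max(np.abs(rms.mean - data.mean(axis=0))))  # ~0 (tiny prior-count bias)
print(np.max(np.abs(rms.var - data.var(axis=0))))    # ~0
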
config.py
ADDED

import tensorflow as tf
import os

# --- Environment Configuration ---
ENV_ID = "LunarLander-v3"
SEED = 123
NUM_ENVS = 12

# --- PPO Hyperparameters ---
TOTAL_TIMESTEPS = 15_000_000
N_STEPS = 4096  # Steps per environment per rollout (4096 * 12 = 49,152 total steps per update)
GAMMA = 0.99
GAE_LAMBDA = 0.95
PPO_EPOCHS = 15
NUM_MINIBATCHES = 4
CLIP_RANGE = 0.1
LEARNING_RATE = 3e-4
RMS_WARMUP_STEPS = 5000

# --- Loss Coefficients ---
VALUE_COEF = 0.5
ENTROPY_COEF = 0.1
MAX_GRAD_NORM = 0.5

# --- Hardware and Logging ---
DEVICE = 'GPU' if tf.config.list_physical_devices('GPU') else 'CPU'
if DEVICE == 'GPU':
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
        except RuntimeError as e:
            print(e)
LOG_DIR = f"./Lunar_Lander_Discrete_logs/ppo_{ENV_ID.lower()}"

# --- Checkpointing and Resuming ---
SAVE_PATH_ROOT = "./Lunar_Lander_Discrete_models"
SAVE_PATH = os.path.join(SAVE_PATH_ROOT, f"ppo_{ENV_ID.lower()}")
RESUME_FILE = f"ppo_{ENV_ID.lower()}_resume.json"
# Save a checkpoint every 21 rollouts
CHECKPOINT_FREQ = N_STEPS * NUM_ENVS * 21  # 4096 * 12 * 21 = 1,032,192 timesteps per checkpoint
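
For orientation, the batch geometry implied by these constants works out as below; a quick sketch (the numbers in the comments follow directly from the values above):

batch_size = N_STEPS * NUM_ENVS                    # 4096 * 12 = 49,152 transitions per update
minibatch_size = batch_size // NUM_MINIBATCHES     # 12,288
updates = TOTAL_TIMESTEPS // batch_size            # ~305 full PPO updates over the run
optimizer_steps = updates * PPO_EPOCHS * NUM_MINIBATCHES  # ~18,300 gradient steps
print(batch_size, minibatch_size, updates, optimizer_steps)
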
main.py
ADDED

import tensorflow as tf
import numpy as np
import os
import time
# Import all constants
from config import *
from utils import make_parallel_envs, save_resume_data, load_resume_data
from agent import PPOAgent

# --- GAE Calculation ---
def compute_gae(rewards, values, dones, next_value, gamma, gae_lambda):
    rewards = rewards.astype(np.float32)
    values = values.astype(np.float32)
    dones = dones.astype(np.float32)
    next_value = next_value.astype(np.float32)
    advantages = np.zeros_like(rewards, dtype=np.float32)
    last_gae_lambda = np.zeros_like(rewards[0], dtype=np.float32)

    # Concatenate current values and next_value so values[t + 1] is V(s')
    values = np.concatenate([values, next_value[None, :]], axis=0)

    for t in reversed(range(N_STEPS)):
        # TD error: delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
        delta = rewards[t] + gamma * values[t + 1] * (1 - dones[t]) - values[t]

        # GAE recursion: A_t = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}
        last_gae_lambda = delta + gamma * gae_lambda * (1 - dones[t]) * last_gae_lambda
        advantages[t] = last_gae_lambda

    # R = A + V
    returns = advantages + values[:-1]
    return advantages, returns
# --- End GAE Calculation ---

def train():
    print(f"--- Running on {DEVICE} ---")

    os.makedirs(LOG_DIR, exist_ok=True)
    os.makedirs(SAVE_PATH, exist_ok=True)
    summary_writer = tf.summary.create_file_writer(LOG_DIR)

    vec_env = make_parallel_envs(ENV_ID, NUM_ENVS, SEED)
    obs_shape = vec_env.single_observation_space.shape
    action_size = vec_env.single_action_space.n

    ACTION_DTYPE = vec_env.action_space.dtype
    print(f"Action space dtype retrieved: {ACTION_DTYPE}. Forcing agent output to this type.")

    agent = PPOAgent(obs_shape, action_size, TOTAL_TIMESTEPS)

    resume_path = os.path.join(os.path.dirname(SAVE_PATH) or '.', RESUME_FILE)
    initial_timesteps, initial_episode = load_resume_data(resume_path)
    current_timesteps = initial_timesteps
    current_episode = initial_episode
    obs, info = vec_env.reset(seed=SEED)

    # Global tracking list, so checkpoint logging never misses completed episodes
    all_episode_returns = []

    # --- INITIAL NORMALIZATION ADAPTATION ---
    if current_timesteps == 0:
        print("Adapting observation normalizer for stability...")
        initial_observations = []
        current_obs = obs
        for _ in range(RMS_WARMUP_STEPS // NUM_ENVS):
            action_array = vec_env.action_space.sample()
            actions_to_step = np.ascontiguousarray(action_array.reshape(NUM_ENVS).astype(ACTION_DTYPE))
            current_obs, _, _, _, _ = vec_env.step(actions_to_step)
            initial_observations.append(current_obs)

        initial_observations = np.array(initial_observations).reshape(-1, obs_shape[0])
        agent.adapt_normalization(initial_observations)

        obs, info = vec_env.reset(seed=SEED)
        print("Normalizer adapted. Starting training.")

    elif initial_timesteps > 0:
        try:
            agent.load_model(SAVE_PATH, initial_timesteps)
            print(f"Resumed training from timestep: {initial_timesteps}")
        except Exception as e:
            print(f"Error loading model weights at {initial_timesteps}: {e}. Starting from scratch.")
            current_timesteps = 0
            current_episode = 0
            obs, info = vec_env.reset(seed=SEED)

    start_time = time.time()
    base_timesteps = current_timesteps

    while current_timesteps < TOTAL_TIMESTEPS:
        rollout_data = {'observations': [], 'actions': [], 'log_probs': [],
                        'rewards': [], 'values': [], 'dones': [], 'old_values': []}

        episode_returns_list = []  # Tracks returns for CLI output

        for step in range(N_STEPS):
            actions, values, log_probs = agent.select_action(obs)

            actions_to_step = np.ascontiguousarray(actions.reshape(NUM_ENVS).astype(ACTION_DTYPE))

            new_obs, rewards, terminateds, truncateds, infos = vec_env.step(actions_to_step)
            dones = np.logical_or(terminateds, truncateds)

            # --- DEBUG (disabled): inspect infos keys when an episode finishes ---
            # if np.any(dones):
            #     print(f"DEBUG STEP: Done signal received! Raw infos keys: {infos.keys()}")
            #     if 'final_info' in infos:
            #         print("DEBUG STEP: 'final_info' key IS present.")
            #     elif 'episode' in infos:
            #         print("DEBUG STEP: 'final_info' missing, but raw 'episode' key IS present. Using fallback extraction.")
            #     else:
            #         print("DEBUG STEP: 'final_info' key IS MISSING from infos dictionary.")

            # Store data
            rollout_data['observations'].append(np.array(obs).copy())
            rollout_data['actions'].append(actions_to_step.copy())
            rollout_data['log_probs'].append(np.array(log_probs).copy())
            rollout_data['values'].append(np.array(values).copy())
            rollout_data['old_values'].append(np.array(values).copy())
            rollout_data['rewards'].append(np.array(rewards).copy())
            rollout_data['dones'].append(np.array(dones).copy())

            obs = new_obs

            # --- AsyncVectorEnv episode-statistics extraction ---
            log_step = base_timesteps + ((step + 1) * NUM_ENVS)

            # 1. Standard extraction: 'final_info' is a list of per-env dicts
            if 'final_info' in infos:
                env_infos_to_process = infos['final_info']
            # 2. Fallback extraction: some setups expose a raw, aggregated 'episode' key
            #    (lists of returns/lengths for the finished episodes) instead of 'final_info'
            elif 'episode' in infos:
                env_infos_to_process = [{'episode': {'r': r, 'l': l}}
                                        for r, l in zip(infos['episode']['r'], infos['episode']['l'])]
            else:
                env_infos_to_process = []

            for env_info in env_infos_to_process:
                # Must be a dictionary and not None
                if env_info is not None and isinstance(env_info, dict):
                    # The episode stats are nested under the 'episode' key
                    if 'episode' in env_info and isinstance(env_info['episode'], dict):
                        episode_stats = env_info['episode']

                        # Explicitly check for the 'r' (return) and 'l' (length) keys
                        if 'r' in episode_stats and 'l' in episode_stats:
                            episode_return = float(episode_stats['r'])
                            episode_length = int(episode_stats['l'])

                            # Populate both the per-rollout and the global list
                            episode_returns_list.append(episode_return)
                            all_episode_returns.append(episode_return)  # Global list used for checkpoint logging
                            current_episode += 1

                            with summary_writer.as_default():
                                tf.summary.scalar("rollout/episode_return_single", episode_return, step=log_step)
                                tf.summary.scalar("rollout/episode_length_single", episode_length, step=log_step)
            # --- End episode-statistics extraction ---

        # Increment total timesteps after the rollout is collected
        current_timesteps += N_STEPS * NUM_ENVS
        base_timesteps = current_timesteps

        # --- PPO UPDATE ---
        rollout_data = {k: np.asarray(v) for k, v in rollout_data.items()}
        _, next_values, _ = agent.select_action(obs)
        rollout_data['advantages'], rollout_data['returns'] = compute_gae(
            rollout_data['rewards'],
            rollout_data['values'],
            rollout_data['dones'],
            next_values,
            GAMMA,
            GAE_LAMBDA
        )
        ppo_batch = {}
        for k in ['observations', 'actions', 'log_probs', 'returns', 'advantages', 'old_values']:
            if rollout_data[k].ndim > 2:
                ppo_batch[k] = rollout_data[k].reshape((-1,) + rollout_data[k].shape[2:])
            else:
                ppo_batch[k] = rollout_data[k].reshape((-1,))

        policy_loss, value_loss, entropy = agent.learn(ppo_batch)

        # 4. Logging
        fps = current_timesteps / (time.time() - start_time)
        with summary_writer.as_default():
            tf.summary.scalar("train/policy_loss", policy_loss, step=current_timesteps)
            tf.summary.scalar("train/value_loss", value_loss, step=current_timesteps)
            tf.summary.scalar("train/entropy", entropy, step=current_timesteps)
            tf.summary.scalar("perf/timesteps_per_second", fps, step=current_timesteps)

            if episode_returns_list:
                avg_rollout_return = np.mean(episode_returns_list)
                tf.summary.scalar("rollout/rollout_average_return_rollout", avg_rollout_return, step=current_timesteps)
            tf.summary.scalar("train/entropy_coef", ENTROPY_COEF, step=current_timesteps)

        summary_writer.flush()

        # CLI reporting
        if episode_returns_list:
            avg_return = np.mean(episode_returns_list)
            print(f"T: {current_timesteps:<10} | Avg Return: {avg_return:7.2f} | Loss: {policy_loss:7.4f} | FPS: {fps:4.0f}")
        else:
            print(f"T: {current_timesteps:<10} | No episodes completed this rollout. | Loss: {policy_loss:7.4f} | FPS: {fps:4.0f}")

        # 5. Checkpointing and guaranteed logging
        # A checkpoint is saved whenever the timestep count lands on the defined frequency
        if (current_timesteps - N_STEPS * NUM_ENVS) % CHECKPOINT_FREQ == 0:

            print(f"--- DIAGNOSTIC: Total recorded episodes: {len(all_episode_returns)} ---")

            # Log the overall average reward since the start of training
            if all_episode_returns:
                # Use the last 200 episodes for a smooth, reliable moving average
                overall_avg_return = np.mean(all_episode_returns[-200:])
                with summary_writer.as_default():
                    # This tag is fed from the global list, so it always populates in TensorBoard
                    tf.summary.scalar("rollout/overall_average_return_checkpoint", overall_avg_return, step=current_timesteps)
                print(f"--- CHECKPOINT: Overall Avg Reward (Last 200): {overall_avg_return:.2f} ---")

            # Save model and resume data
            agent.save_model(SAVE_PATH, current_timesteps)
            save_resume_data(resume_path, current_timesteps, current_episode)
            print(f"Checkpoint saved at timestep: {current_timesteps}")

    # Final save after loop termination
    print(f"\nTarget reached ({TOTAL_TIMESTEPS} timesteps). Saving final model.")
    agent.save_model(SAVE_PATH, current_timesteps)
    save_resume_data(resume_path, current_timesteps, current_episode)

    vec_env.close()
    summary_writer.close()
    print("Training complete.")

if __name__ == "__main__":
    train()
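
To make the GAE recursion in compute_gae concrete, here is a toy two-step example worked by hand; since compute_gae itself is tied to the global N_STEPS, this sketch applies the same recursion directly to the toy arrays:

# Hand-worked GAE on a two-step rollout (gamma=0.5, lambda=0.5, no dones):
#   delta_1 = r_1 + gamma * V(s_2) - V(s_1) = 1 + 0.5*1.0 - 1.0 = 0.5
#   delta_0 = r_0 + gamma * V(s_1) - V(s_0) = 1 + 0.5*1.0 - 1.0 = 0.5
#   A_1 = delta_1                            = 0.5
#   A_0 = delta_0 + gamma*lambda*A_1         = 0.5 + 0.25*0.5 = 0.625
import numpy as np

gamma, lam = 0.5, 0.5
rewards = np.array([1.0, 1.0]); values = np.array([1.0, 1.0])
next_value = 1.0; dones = np.array([0.0, 0.0])

v = np.append(values, next_value)
adv = np.zeros(2); last = 0.0
for t in reversed(range(2)):
    delta = rewards[t] + gamma * v[t + 1] * (1 - dones[t]) - v[t]
    last = delta + gamma * lam * (1 - dones[t]) * last
    adv[t] = last
print(adv)  # [0.625 0.5]
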
reward_shaping.py
ADDED

import numpy as np
from gymnasium import Wrapper

# --- CONFIGURATION FOR REWARD SHAPING ---
# Sustainable time penalty for fuel efficiency.
TIME_PENALTY = -0.05
# ----------------------------------------

class LunarLanderRewardShaping(Wrapper):
    """
    Hyper-optimized structure for a stable, centered, and fuel-efficient landing.
    Uses catastrophic penalties to enforce terminal stabilization and centering.
    """
    def __init__(self, env):
        super().__init__(env)
        self.last_shaping_reward = None

    def step(self, action):
        observation, reward, terminated, truncated, info = self.env.step(action)

        # 1. Unpack the relevant variables from the observation
        x_pos = observation[0]
        y_pos = observation[1]
        x_vel = observation[2]
        y_vel = observation[3]
        angle = observation[4]
        angular_vel = observation[5]

        # Leg contact boolean/float values
        left_leg_contact = observation[6]
        right_leg_contact = observation[7]

        # Whether the main engine (action 2) was fired
        main_engine_fired = 1 if action == 2 else 0

        # Whether *any* thruster (actions 1, 2, or 3) was fired
        any_thruster_fired = 1 if action != 0 else 0

        # Proximity factor (close to 1.0 near the ground, 0.0 high up)
        proximity_factor = 1.0 - y_pos

        # 2. Calculate the total current shaping value
        current_shaping_reward = 0.0

        # A. Height-weighted horizontal position penalty
        # Heavily penalizes being off-center (-100) ONLY when close to the ground,
        # forcing the agent to center its x-position at low altitude.
        current_shaping_reward += -100 * np.abs(x_pos) * proximity_factor

        # B. Vertical velocity penalty near the ground (strong, for a soft landing)
        current_shaping_reward += -40 * np.abs(y_vel) * proximity_factor

        # C. Scaled-back horizontal speed penalty
        # Relaxed (-5) to encourage smooth lateral coasting and save side-thruster fuel.
        current_shaping_reward += -5 * np.abs(x_vel)

        # D. Sharpened angle penalty
        # Strong pressure (-10) toward a perfectly vertical attitude.
        current_shaping_reward += -10 * np.abs(angle)

        # E. Penalize a high spin rate
        current_shaping_reward += -10 * np.abs(angular_vel)

        # F. Main engine usage penalty near the ground
        current_shaping_reward += -20 * main_engine_fired * proximity_factor

        # G. Relaxed height penalty (pressure to descend, but not panic)
        current_shaping_reward += -10 * y_pos

        # H. Catastrophic post-contact thrust penalty
        # Set to -3000 so that ANY post-contact thrust is worse than failing the
        # episode, forcing the agent to choose "Action 0: Do Nothing" upon touchdown.
        contact_sum = left_leg_contact + right_leg_contact
        if contact_sum > 0:
            current_shaping_reward += -3000 * any_thruster_fired * contact_sum

        # I. Aggressive landing-leg use incentive
        current_shaping_reward += 15 * contact_sum

        # 3. Differential shaping reward (reward for improvement)
        if self.last_shaping_reward is not None:
            shaping_reward_diff = current_shaping_reward - self.last_shaping_reward
            # Clip the differential reward to prevent massive, unstable jumps
            reward += np.clip(shaping_reward_diff, -10.0, 10.0)

        self.last_shaping_reward = current_shaping_reward

        # --- Sustainable time penalty (-0.05) ---
        reward += TIME_PENALTY

        return observation, reward, terminated, truncated, info

    def reset(self, **kwargs):
        """Resets the environment and the shaping tracker."""
        self.last_shaping_reward = None
        return self.env.reset(**kwargs)

    def render(self):
        """Passes the render call to the base environment."""
        return self.env.render()

    def close(self):
        """Passes the close call to the base environment."""
        return self.env.close()

    def __getattr__(self, name):
        """Delegate attribute access (like observation_space) to the wrapped environment."""
        return getattr(self.env, name)
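
Because the wrapper only adds a clipped differential shaping term plus the constant time penalty, its effect is easy to inspect by diffing against an unwrapped environment seeded identically; a minimal sketch (assumes gymnasium with the Box2D extra installed):

# Compare wrapped vs. raw reward for the same action sequence.
import gymnasium as gym
from reward_shaping import LunarLanderRewardShaping

raw = gym.make("LunarLander-v3")
shaped = LunarLanderRewardShaping(gym.make("LunarLander-v3"))

raw.reset(seed=0)
shaped.reset(seed=0)
for _ in range(5):
    a = 0  # "do nothing": identical dynamics in both envs
    _, r_raw, *_ = raw.step(a)
    _, r_shaped, *_ = shaped.step(a)
    print(f"raw={r_raw:8.3f}  shaped={r_shaped:8.3f}  delta={r_shaped - r_raw:8.3f}")
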
trained_agent.py
ADDED

import gymnasium as gym
import numpy as np
import tensorflow as tf
import os
import time

# Assuming config, agent, and utils are in the same directory
from config import ENV_ID, SEED, SAVE_PATH, TOTAL_TIMESTEPS
from agent import PPOAgent
# Import the custom wrapper class used during training
from reward_shaping import LunarLanderRewardShaping

# Force tf.function-decorated code to run eagerly (simplifies single-step inference)
tf.config.run_functions_eagerly(True)

def run_trained_agent(episodes=10):
    """
    Loads the latest available trained PPO agent checkpoint and runs it
    for the specified number of episodes.
    """
    print(f"--- Running Trained Agent on {ENV_ID} with Human Rendering ---")

    # 1. Environment setup: a single, non-vectorized environment for clean inference.
    try:
        # Create the base environment
        env = gym.make(ENV_ID, render_mode="human")

        # Wrap the environment exactly as during training:
        # the agent was trained on this wrapper, so it must be evaluated on it.
        env = LunarLanderRewardShaping(env)

    except Exception as e:
        print(f"ERROR: Could not create environment {ENV_ID} or apply wrapper. Details: {e}")
        return

    obs_shape = env.observation_space.shape

    # Pass the integer size (.n) instead of the Discrete space object
    action_size = env.action_space.n

    # Reset the single environment
    current_obs, info = env.reset(seed=SEED)

    # 2. Agent initialization
    agent = PPOAgent(obs_shape, action_size, TOTAL_TIMESTEPS)

    # 3. Load the latest checkpoint
    latest_checkpoint = agent.checkpoint_manager.latest_checkpoint

    if not latest_checkpoint:
        print("\nERROR: Could not find any checkpoint in the designated save path.")
        env.close()
        return
    try:
        # Restore everything tracked by the checkpoint object
        agent.checkpoint.restore(latest_checkpoint).expect_partial()

        # Manually sync the NumPy-based RMS object after restoration
        agent.obs_rms.mean = agent.rms_mean_var.numpy()
        agent.obs_rms.var = agent.rms_var_var.numpy()
        agent.obs_rms.count = agent.rms_count_var.numpy()

        loaded_timesteps = int(os.path.basename(latest_checkpoint).split('-')[-1])
        print(f"\nSuccessfully loaded latest checkpoint trained to T={loaded_timesteps}")

    except Exception as e:
        print(f"\nERROR: Failed to restore checkpoint at {latest_checkpoint}. Details: {e}")
        print("Suggestion: Check the consistency of your environment setup (wrapper, action size).")
        env.close()
        return

    # 4. Run episodes
    print(f"\nStarting {episodes} playback episodes...")
    total_rewards = []
    for i in range(episodes):
        done = False
        episode_reward = 0
        step_count = 0

        while not done:
            # Call env.render() inside the loop to refresh the visualization
            env.render()

            # current_obs has shape (8,); reshape to (1, 8) for the model input
            obs_to_agent = current_obs.reshape(1, *obs_shape)

            # Select action
            actions, _, _ = agent.select_action(obs_to_agent)

            # Step the environment with the single action
            action_to_step = actions[0]
            current_obs, reward, terminated, truncated, info = env.step(action_to_step)

            done = terminated or truncated
            episode_reward += reward
            step_count += 1

            # Simple visualization pause
            time.sleep(0.01)

        total_rewards.append(episode_reward)
        print(f"Episode {i+1}: Reward = {episode_reward:7.2f}, Steps = {step_count}")

        # Reset for the next episode
        current_obs, info = env.reset()

    # 5. Cleanup and summary
    env.close()

    if total_rewards:
        print("-" * 30)
        print(f"Average Reward over {episodes} episodes: {np.mean(total_rewards):7.2f}")
        print("-" * 30)

if __name__ == "__main__":
    run_trained_agent(episodes=10)
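
The timestep parsing above relies on tf.train.CheckpointManager's default naming, which prefixes checkpoint files with "ckpt-" followed by the checkpoint_number passed to save(); here save_model passes the timestep count. A tiny illustration with a hypothetical path:

# Illustration only: a hypothetical checkpoint path in the default
# "ckpt-<checkpoint_number>" naming scheme used by CheckpointManager.
import os
path = "./Lunar_Lander_Discrete_models/ppo_lunarlander-v3/tf_checkpoints/ckpt-49152"
print(int(os.path.basename(path).split('-')[-1]))  # 49152
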
utils.py
ADDED

import gymnasium as gym
from gymnasium.vector import AsyncVectorEnv
import numpy as np
import json
import os
import time

# Import the reward shaping wrapper
from reward_shaping import LunarLanderRewardShaping
from config import *  # Constants like ENV_ID, N_STEPS, NUM_ENVS, GAMMA, RESUME_FILE

# --- Helper Functions for Checkpointing ---
def save_resume_data(filepath, timesteps, episodes):
    """
    Saves the current training state to a JSON file at the specified filepath.
    """
    data = {
        "timesteps": timesteps,
        "episode_count": episodes,
        "date": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    }
    try:
        with open(filepath, 'w') as f:
            json.dump(data, f)
    except Exception as e:
        print(f"Error saving resume data to {filepath}: {e}")

def load_resume_data(filepath):
    """
    Loads the last saved training state from the specified JSON file.
    Returns (timesteps, episode_count).
    """
    if os.path.exists(filepath):
        try:
            with open(filepath, 'r') as f:
                data = json.load(f)
            timesteps = data.get("timesteps", 0)
            episodes = data.get("episode_count", 0)
            return (timesteps, episodes)
        except Exception as e:
            print(f"Error loading resume data from {filepath}: {e}. Starting from scratch.")
    return (0, 0)

# --- Environment Setup for Parallel Execution ---
def make_env(env_id, seed, idx, **kwargs):
    """
    Creates a single environment instance with a unique seed and applies the necessary wrappers.
    """
    def thunk():
        # 1. Create the base environment
        env = gym.make(env_id, **kwargs)

        # 2. Apply the custom wrapper (LunarLanderRewardShaping);
        #    essential for LunarLander to learn from a dense reward structure
        env = LunarLanderRewardShaping(env)

        # 3. Apply the standard logging wrapper: it tracks the episode reward ('r')
        #    and length ('l') and puts them into `infos` when an episode finishes
        env = gym.wrappers.RecordEpisodeStatistics(env)

        # 4. Seed the final wrapped environment's spaces
        env.action_space.seed(seed + idx)
        env.observation_space.seed(seed + idx)

        # The result is the fully wrapped, final environment instance
        return env
    return thunk

def make_parallel_envs(env_id, num_envs, seed):
    """
    Creates multiple environments and wraps them in an AsyncVectorEnv.
    """
    env_fns = [make_env(env_id, seed, i) for i in range(num_envs)]
    return AsyncVectorEnv(env_fns)

# --- GAE Calculation ---
def calculate_gae(rewards, values, terminated, truncated, next_value, gamma=GAMMA, gae_lambda=GAE_LAMBDA):
    """
    Calculates Generalized Advantage Estimation (GAE) advantages and returns from rollout data.
    """
    advantages = np.zeros_like(rewards, dtype=np.float32)
    last_gae_lambda = 0

    for t in reversed(range(N_STEPS)):
        # Mask out bootstrapping across episode boundaries
        non_terminal = 1.0 - (terminated[t] | truncated[t]).astype(np.float32)
        # V(s_{t+1}): bootstrap from next_value on the last step of the rollout
        next_state_value = next_value if t == N_STEPS - 1 else values[t + 1]

        delta = rewards[t] + gamma * next_state_value * non_terminal - values[t]
        advantages[t] = delta + gamma * gae_lambda * non_terminal * last_gae_lambda
        last_gae_lambda = advantages[t]

    returns = advantages + values
    return advantages, returns
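
A short smoke test of make_parallel_envs shows the batched shapes the training loop relies on; a sketch, assuming gymnasium with the Box2D extra is installed:

# Smoke test: verify the batched observation/reward shapes from the vector env.
import numpy as np
from utils import make_parallel_envs

envs = make_parallel_envs("LunarLander-v3", num_envs=4, seed=0)
obs, info = envs.reset(seed=0)
print(obs.shape)  # (4, 8): one row per parallel environment

actions = np.zeros(4, dtype=envs.action_space.dtype)
obs, rewards, terminateds, truncateds, infos = envs.step(actions)
print(rewards.shape, terminateds.shape)  # (4,) (4,)
envs.close()
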