privateboss committed · verified
Commit 688b303 · 1 Parent(s): 69ec84f

Upload 6 files

Files changed (6)
  1. agent.py +278 -0
  2. config.py +42 -0
  3. main.py +246 -0
  4. reward_shaping.py +113 -0
  5. trained_agent.py +120 -0
  6. utils.py +102 -0
agent.py ADDED
@@ -0,0 +1,278 @@
+ import tensorflow as tf
+ from keras.layers import Dense, Normalization, Input
+ from keras.models import Model
+ import tensorflow_probability as tfp
+ import numpy as np
+ import os
+ from config import *
+
+ # Helper to normalize observations with a running mean/variance estimate
+ class RunningMeanStd:
+     def __init__(self, shape):
+         self.mean = np.zeros(shape, dtype=np.float32)
+         self.var = np.ones(shape, dtype=np.float32)
+         self.count = 1e-4
+
+     def update(self, x):
+         batch_mean = np.mean(x, axis=0)
+         batch_var = np.var(x, axis=0)
+         batch_count = x.shape[0]
+
+         self.update_from_moments(batch_mean, batch_var, batch_count)
+
+     def update_from_moments(self, batch_mean, batch_var, batch_count):
+         # Parallel mean/variance merge of the running moments with the new batch
+         delta = batch_mean - self.mean
+         total_count = self.count + batch_count
+
+         new_mean = self.mean + delta * batch_count / total_count
+
+         m_a = self.var * self.count
+         m_b = batch_var * batch_count
+         m2 = m_a + m_b + np.square(delta) * self.count * batch_count / total_count
+
+         new_var = m2 / total_count
+
+         self.mean = new_mean
+         self.var = new_var
+         self.count = total_count
+
+ class PPOAgent:
+     def __init__(self, obs_shape, action_size, total_timesteps):
+         self.obs_shape = obs_shape
+         self.action_size = action_size
+
+         # Networks
+         self.policy = self._build_policy_model(obs_shape, action_size)
+         self.value = self._build_value_model(obs_shape)
+
+         # Learning Rate Schedule (crucial for PPO): decays linearly to zero over training
+         self.lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
+             initial_learning_rate=LEARNING_RATE,
+             decay_steps=total_timesteps * PPO_EPOCHS / (N_STEPS * NUM_ENVS),
+             end_learning_rate=0.0
+         )
+         self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.lr_schedule, epsilon=1e-5)
+
+         # Observation normalizer
+         self.obs_rms = RunningMeanStd(shape=obs_shape)
+
+         # --- CHECKPOINTING SETUP ---
+         # Wrap RMS parameters in tf.Variable so they can be tracked and saved
+         self.rms_mean_var = tf.Variable(self.obs_rms.mean, dtype=tf.float32, name="obs_rms_mean")
+         self.rms_var_var = tf.Variable(self.obs_rms.var, dtype=tf.float32, name="obs_rms_var")
+         self.rms_count_var = tf.Variable(self.obs_rms.count, dtype=tf.float32, name="obs_rms_count")
+
+         self.checkpoint = tf.train.Checkpoint(
+             policy=self.policy,
+             value=self.value,
+             optimizer=self.optimizer,
+             obs_rms_mean=self.rms_mean_var,
+             obs_rms_var=self.rms_var_var,
+             obs_rms_count=self.rms_count_var
+         )
+         self.checkpoint_manager = tf.train.CheckpointManager(
+             self.checkpoint, os.path.join(SAVE_PATH, 'tf_checkpoints'), max_to_keep=1000
+         )
+
+     def _build_policy_model(self, obs_shape, action_size):
+         inputs = tf.keras.Input(shape=obs_shape)
+         x = Dense(64, activation='relu')(inputs)
+         x = Dense(64, activation='relu')(x)
+         logits = Dense(action_size)(x)
+         return tf.keras.Model(inputs=inputs, outputs=logits, name="policy_model")
+
+     def _build_value_model(self, obs_shape):
+         inputs = tf.keras.Input(shape=obs_shape)
+         x = Dense(64, activation='relu')(inputs)
+         x = Dense(64, activation='relu')(x)
+         value = Dense(1)(x)
+         return tf.keras.Model(inputs=inputs, outputs=value, name="value_model")
+
+     def adapt_normalization(self, initial_observations):
+         """Updates the observation normalizer using initial data and saves state to checkpoint variables."""
+         self.obs_rms.update(initial_observations)
+         # Update checkpoint variables for persistence
+         self.rms_mean_var.assign(self.obs_rms.mean)
+         self.rms_var_var.assign(self.obs_rms.var)
+         self.rms_count_var.assign(self.obs_rms.count)
+
+     def normalize_obs(self, obs):
+         """
+         Applies normalization. Runs in Eager mode (NumPy input) or Graph mode (Tensor input).
+         The source of the RMS parameters is selected based on the input type.
+         """
+         is_tensor = tf.is_tensor(obs)
+
+         # Determine the RMS source:
+         # use the tf.Variables if the input is a Tensor (for learn_step),
+         # use the NumPy arrays if the input is a NumPy array (for select_action / rollout prep).
+         rms_mean = self.rms_mean_var if is_tensor else self.obs_rms.mean
+         rms_var = self.rms_var_var if is_tensor else self.obs_rms.var
+
+         # Convert the input to float32 if it is not already a tensor
+         if not is_tensor:
+             obs = obs.astype(np.float32)
+
+         # Normalization calculation (must use tf functions)
+         normalized_obs = (obs - rms_mean) / tf.sqrt(rms_var + 1e-8)
+
+         # Clipping (using tf.clip_by_value)
+         normalized_obs = tf.clip_by_value(normalized_obs, -10.0, 10.0)
+
+         # Return a NumPy array if the input was NumPy (used for rollout collection and saving)
+         if not is_tensor:
+             # Explicitly convert the result of the TF operations back to NumPy here
+             return normalized_obs.numpy()
+
+         return normalized_obs
+
+     # --- FIX: removed @tf.function here to allow .numpy() calls in eager mode ---
+     def select_action(self, obs):
+         """Selects an action and computes the value and log-prob for the given observation in eager mode."""
+
+         # 1. Convert the incoming NumPy array to a Tensor for the model forward pass
+         obs_tensor = tf.convert_to_tensor(obs, dtype=tf.float32)
+
+         # 2. Normalize the observation tensor
+         normalized_obs = self.normalize_obs(obs_tensor)
+
+         # Forward pass (runs efficiently even without @tf.function thanks to Keras's tracing)
+         logits = self.policy(normalized_obs, training=False)
+         values = self.value(normalized_obs, training=False)
+
+         # Create a categorical distribution and sample
+         distribution = tfp.distributions.Categorical(logits=logits)
+         actions = distribution.sample()
+
+         # Compute the log probability of the sampled action
+         log_probs = distribution.log_prob(actions)
+
+         # 3. Convert EagerTensors back to NumPy arrays (works because we are in eager mode)
+         actions_np = actions.numpy().astype(np.int64)
+         values_np = values.numpy().flatten()
+         log_probs_np = log_probs.numpy()
+
+         return actions_np, values_np, log_probs_np
+
+     # The learn_step remains decorated with @tf.function
+     @tf.function
+     def learn_step(self, obs, actions, old_log_probs, returns, advantages, old_values):
+         """Performs a single PPO optimization step."""
+         with tf.GradientTape() as tape:
+             # 1. Forward pass
+             logits = self.policy(obs, training=True)
+             # Squeeze (batch, 1) -> (batch,) so the value loss broadcasts correctly
+             # against the flattened returns/old_values
+             values = tf.squeeze(self.value(obs, training=True), axis=-1)
+
+             # 2. Compute the probability ratio
+             distribution = tfp.distributions.Categorical(logits=logits)
+             log_probs = distribution.log_prob(actions)
+             ratio = tf.exp(log_probs - old_log_probs)
+
+             # 3. Compute the clipped value loss
+             values_clipped = old_values + tf.clip_by_value(values - old_values, -CLIP_RANGE, CLIP_RANGE)
+             value_loss1 = tf.square(returns - values)
+             value_loss2 = tf.square(returns - values_clipped)
+             value_loss = 0.5 * tf.reduce_mean(tf.maximum(value_loss1, value_loss2))
+
+             # 4. Compute the clipped policy loss
+             pg_loss1 = -advantages * ratio
+             pg_loss2 = -advantages * tf.clip_by_value(ratio, 1.0 - CLIP_RANGE, 1.0 + CLIP_RANGE)
+             policy_loss = tf.reduce_mean(tf.maximum(pg_loss1, pg_loss2))
+
+             # 5. Compute the entropy bonus
+             entropy = tf.reduce_mean(distribution.entropy())
+
+             # 6. Total loss
+             total_loss = policy_loss + VALUE_COEF * value_loss - ENTROPY_COEF * entropy
+
+         # Apply gradients
+         grads = tape.gradient(total_loss, self.policy.trainable_variables + self.value.trainable_variables)
+         grads, _ = tf.clip_by_global_norm(grads, MAX_GRAD_NORM)
+         self.optimizer.apply_gradients(zip(grads, self.policy.trainable_variables + self.value.trainable_variables))
+
+         return -policy_loss, value_loss, entropy
+
+     def learn(self, ppo_batch):
+         """PPO update loop with data preparation and mini-batching."""
+
+         # 1. Update RMS and sync checkpoint variables (in eager mode)
+         self.obs_rms.update(ppo_batch['observations'])
+         self.rms_mean_var.assign(self.obs_rms.mean)
+         self.rms_var_var.assign(self.obs_rms.var)
+         self.rms_count_var.assign(self.obs_rms.count)
+
+         # 2. Normalize observations using the NumPy-based RMS object
+         obs = self.normalize_obs(ppo_batch['observations'])
+
+         actions = ppo_batch['actions']
+         old_log_probs = ppo_batch['log_probs']
+         returns = ppo_batch['returns']
+         advantages = ppo_batch['advantages']
+         old_values = ppo_batch['old_values']
+
+         # Normalizing advantages is critical for PPO stability
+         advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)
+
+         # 3. Cast all prepared data to TensorFlow tensors for graph execution
+         obs_tensor = tf.convert_to_tensor(obs, dtype=tf.float32)
+         actions_tensor = tf.convert_to_tensor(actions, dtype=tf.int64)
+         old_log_probs_tensor = tf.convert_to_tensor(old_log_probs, dtype=tf.float32)
+         returns_tensor = tf.convert_to_tensor(returns.flatten(), dtype=tf.float32)
+         advantages_tensor = tf.convert_to_tensor(advantages.flatten(), dtype=tf.float32)
+         old_values_tensor = tf.convert_to_tensor(old_values.flatten(), dtype=tf.float32)
+
+         batch_size = obs_tensor.shape[0]
+         minibatch_size = batch_size // NUM_MINIBATCHES
+
+         policy_losses, value_losses, entropies = [], [], []
+
+         for epoch in range(PPO_EPOCHS):
+             # Shuffle indices
+             indices = tf.range(batch_size)
+             shuffled_indices = tf.random.shuffle(indices)
+
+             for start in range(0, batch_size, minibatch_size):
+                 end = start + minibatch_size
+                 minibatch_indices = shuffled_indices[start:end]
+
+                 # Gather minibatch data (this happens efficiently on CPU/GPU)
+                 mb_obs = tf.gather(obs_tensor, minibatch_indices)
+                 mb_actions = tf.gather(actions_tensor, minibatch_indices)
+                 mb_old_log_probs = tf.gather(old_log_probs_tensor, minibatch_indices)
+                 mb_returns = tf.gather(returns_tensor, minibatch_indices)
+                 mb_advantages = tf.gather(advantages_tensor, minibatch_indices)
+                 mb_old_values = tf.gather(old_values_tensor, minibatch_indices)
+
+                 # Perform the learning step (runs in the traced graph)
+                 p_loss, v_loss, entropy = self.learn_step(
+                     mb_obs, mb_actions, mb_old_log_probs, mb_returns, mb_advantages, mb_old_values
+                 )
+                 policy_losses.append(p_loss.numpy())
+                 value_losses.append(v_loss.numpy())
+                 entropies.append(entropy.numpy())
+
+         return np.mean(policy_losses), np.mean(value_losses), np.mean(entropies)
+
+     def save_model(self, save_dir, timesteps):
+         """Saves the full checkpoint using the manager."""
+         self.checkpoint_manager.save(checkpoint_number=timesteps)
+
+     def load_model(self, save_dir, timesteps):
+         """Loads the latest successful checkpoint."""
+         latest_checkpoint = self.checkpoint_manager.latest_checkpoint
+
+         if latest_checkpoint:
+             print(f"Restoring checkpoint from {latest_checkpoint}...")
+             # Restore everything tracked by the checkpoint object
+             self.checkpoint.restore(latest_checkpoint).expect_partial()
+
+             # --- IMPORTANT: update self.obs_rms with the restored tensor values ---
+             self.obs_rms.mean = self.rms_mean_var.numpy()
+             self.obs_rms.var = self.rms_var_var.numpy()
+             self.obs_rms.count = self.rms_count_var.numpy()
+
+             print("Model, Optimizer, and Normalizer restored successfully.")
+         else:
+             raise FileNotFoundError(f"No checkpoint found in {self.checkpoint_manager.directory}")
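As a quick sanity check of the agent above, here is a minimal sketch (not part of the upload; it assumes agent.py, config.py, and their dependencies are importable) that builds the agent for LunarLander's 8-dimensional observations and 4 discrete actions and runs one batched action selection:

import numpy as np
from agent import PPOAgent
from config import TOTAL_TIMESTEPS

# Hypothetical smoke test: shapes match LunarLander (8 obs dims, 4 discrete actions).
agent = PPOAgent(obs_shape=(8,), action_size=4, total_timesteps=TOTAL_TIMESTEPS)

obs = np.random.randn(12, 8).astype(np.float32)       # one fake observation per parallel env
actions, values, log_probs = agent.select_action(obs)
print(actions.shape, values.shape, log_probs.shape)   # expected: (12,) (12,) (12,)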
config.py ADDED
@@ -0,0 +1,42 @@
+ import tensorflow as tf
+ import os
+
+ # --- Environment Configuration ---
+ ENV_ID = "LunarLander-v3"
+ SEED = 123
+ NUM_ENVS = 12
+
+ # --- PPO Hyperparameters ---
+ TOTAL_TIMESTEPS = 15_000_000
+ N_STEPS = 4096  # Steps per environment per rollout (4096 * 12 = 49,152 total steps per update)
+ GAMMA = 0.99
+ GAE_LAMBDA = 0.95
+ PPO_EPOCHS = 15
+ NUM_MINIBATCHES = 4
+ CLIP_RANGE = 0.1
+ LEARNING_RATE = 3e-4
+ RMS_WARMUP_STEPS = 5000
+
+ # --- Loss Coefficients ---
+ VALUE_COEF = 0.5
+ ENTROPY_COEF = 0.1
+ MAX_GRAD_NORM = 0.5
+
+ # --- Hardware and Logging ---
+ DEVICE = 'GPU' if tf.config.list_physical_devices('GPU') else 'CPU'
+ if DEVICE == 'GPU':
+     gpus = tf.config.experimental.list_physical_devices('GPU')
+     if gpus:
+         try:
+             for gpu in gpus:
+                 tf.config.experimental.set_memory_growth(gpu, True)
+         except RuntimeError as e:
+             print(e)
+ LOG_DIR = f"./Lunar_Lander_Discrete_logs/ppo_{ENV_ID.lower()}"
+
+ # --- Checkpointing and Resuming ---
+ SAVE_PATH_ROOT = "./Lunar_Lander_Discrete_models"
+ SAVE_PATH = os.path.join(SAVE_PATH_ROOT, f"ppo_{ENV_ID.lower()}")
+ RESUME_FILE = f"ppo_{ENV_ID.lower()}_resume.json"
+ # The checkpoint is saved every 21 rollouts
+ CHECKPOINT_FREQ = N_STEPS * NUM_ENVS * 21  # 4096 * 12 * 21 = 1,032,192 timesteps between checkpoints,
+                                            # after the initial 49,152-step rollout that includes the RMS warm-up
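For reference, a short sketch of how the constants above combine (values derived from this config, assuming it is importable as config):

from config import N_STEPS, NUM_ENVS, NUM_MINIBATCHES, TOTAL_TIMESTEPS, CHECKPOINT_FREQ

batch_per_update = N_STEPS * NUM_ENVS                          # 4096 * 12 = 49,152 transitions per rollout
minibatch_size = batch_per_update // NUM_MINIBATCHES           # 49,152 / 4 = 12,288
num_updates = TOTAL_TIMESTEPS // batch_per_update              # 15,000,000 // 49,152 = 305 PPO updates
rollouts_per_checkpoint = CHECKPOINT_FREQ // batch_per_update  # 21
print(batch_per_update, minibatch_size, num_updates, rollouts_per_checkpoint)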
main.py ADDED
@@ -0,0 +1,246 @@
+ import tensorflow as tf
+ import numpy as np
+ import os
+ import time
+ from datetime import datetime
+ # Import all constants
+ from config import *
+ from utils import make_parallel_envs, save_resume_data, load_resume_data
+ from agent import PPOAgent
+
+ # --- GAE Calculation ---
+ def compute_gae(rewards, values, dones, next_value, gamma, gae_lambda):
+     rewards = rewards.astype(np.float32)
+     values = values.astype(np.float32)
+     dones = dones.astype(np.float32)
+     next_value = next_value.astype(np.float32)
+     advantages = np.zeros_like(rewards, dtype=np.float32)
+     last_gae_lambda = np.zeros_like(rewards[0], dtype=np.float32)
+
+     # Concatenate current values and next_value for V(s')
+     values = np.concatenate([values, next_value[None, :]], axis=0)
+
+     for t in reversed(range(N_STEPS)):
+         # TD error: delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
+         delta = rewards[t] + gamma * values[t + 1] * (1 - dones[t]) - values[t]
+
+         # GAE recursion: A_t = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}
+         last_gae_lambda = delta + gamma * gae_lambda * (1 - dones[t]) * last_gae_lambda
+         advantages[t] = last_gae_lambda
+
+     # Returns: R_t = A_t + V(s_t)
+     returns = advantages + values[:-1]
+     return advantages, returns
+ # --- End GAE Calculation ---
+
+ def train():
+     print(f"--- Running on {DEVICE} ---")
+
+     os.makedirs(LOG_DIR, exist_ok=True)
+     os.makedirs(SAVE_PATH, exist_ok=True)
+     summary_writer = tf.summary.create_file_writer(LOG_DIR)
+
+     vec_env = make_parallel_envs(ENV_ID, NUM_ENVS, SEED)
+     obs_shape = vec_env.single_observation_space.shape
+     action_size = vec_env.single_action_space.n
+
+     ACTION_DTYPE = vec_env.action_space.dtype
+     print(f"Action space dtype retrieved: {ACTION_DTYPE}. Forcing agent output to this type.")
+
+     agent = PPOAgent(obs_shape, action_size, TOTAL_TIMESTEPS)
+
+     resume_path = os.path.join(os.path.dirname(SAVE_PATH) or '.', RESUME_FILE)
+     initial_timesteps, initial_episode = load_resume_data(resume_path)
+     current_timesteps = initial_timesteps
+     current_episode = initial_episode
+     obs, info = vec_env.reset(seed=SEED)
+
+     # --- CRITICAL FIX FOR LOGGING RELIABILITY: global tracking list used at checkpoint time ---
+     all_episode_returns = []
+
+     # --- INITIAL NORMALIZATION ADAPTATION ---
+     if current_timesteps == 0:
+         print("Adapting observation normalizer for stability...")
+         initial_observations = []
+         current_obs = obs
+         for _ in range(RMS_WARMUP_STEPS // NUM_ENVS):
+             action_array = vec_env.action_space.sample()
+             actions_to_step = np.ascontiguousarray(action_array.reshape(NUM_ENVS).astype(ACTION_DTYPE))
+             current_obs, _, _, _, _ = vec_env.step(actions_to_step)
+             initial_observations.append(current_obs)
+
+         initial_observations = np.array(initial_observations).reshape(-1, obs_shape[0])
+         agent.adapt_normalization(initial_observations)
+
+         obs, info = vec_env.reset(seed=SEED)
+         print("Normalizer adapted. Starting training.")
+
+     elif initial_timesteps > 0:
+         try:
+             agent.load_model(SAVE_PATH, initial_timesteps)
+             print(f"Resumed training from timestep: {initial_timesteps}")
+         except Exception as e:
+             print(f"Error loading model weights at {initial_timesteps}: {e}. Starting from scratch.")
+             current_timesteps = 0
+             current_episode = 0
+             obs, info = vec_env.reset(seed=SEED)
+
+     start_time = time.time()
+
+     base_timesteps = current_timesteps
+
+     while current_timesteps < TOTAL_TIMESTEPS:
+         rollout_data = {'observations': [], 'actions': [], 'log_probs': [],
+                         'rewards': [], 'values': [], 'dones': [], 'old_values': []}
+
+         episode_returns_list = []  # Tracks returns for CLI output
+
+         for step in range(N_STEPS):
+             actions, values, log_probs = agent.select_action(obs)
+
+             actions_to_step = np.ascontiguousarray(actions.reshape(NUM_ENVS).astype(ACTION_DTYPE))
+
+             new_obs, rewards, terminateds, truncateds, infos = vec_env.step(actions_to_step)
+             dones = np.logical_or(terminateds, truncateds)
+
+             # --- DEBUG PRINT (disabled) ---
+             # if np.any(dones):
+             #     print(f"DEBUG STEP: Done signal received! Raw infos keys: {infos.keys()}")
+             #     if 'final_info' in infos:
+             #         print("DEBUG STEP: final_info key IS present.")
+             #     elif 'episode' in infos:
+             #         print("DEBUG STEP: 'final_info' key IS MISSING, but raw 'episode' key IS present. Attempting fallback extraction.")
+             #     else:
+             #         print("DEBUG STEP: 'final_info' key IS MISSING from infos dictionary.")
+             # --- END DEBUG PRINT ---
+
+             # Store data
+             rollout_data['observations'].append(np.array(obs).copy())
+             rollout_data['actions'].append(actions_to_step.copy())
+             rollout_data['log_probs'].append(np.array(log_probs).copy())
+             rollout_data['values'].append(np.array(values).copy())
+             rollout_data['old_values'].append(np.array(values).copy())
+             rollout_data['rewards'].append(np.array(rewards).copy())
+             rollout_data['dones'].append(np.array(dones).copy())
+
+             obs = new_obs
+
+             # --- AsyncVectorEnv logging mechanism (extract episode stats) ---
+             log_step = base_timesteps + ((step + 1) * NUM_ENVS)
+
+             # 1. Standard extraction (expected behaviour: final_info is a list of dicts)
+             if 'final_info' in infos:
+                 env_infos_to_process = infos['final_info']
+
+             # 2. Fallback extraction (observed behaviour: a raw 'episode' key is present)
+             #    Less common, but necessary for this environment's vectorized output.
+             elif 'episode' in infos:
+                 # Assume 'episode' is a dict containing arrays of returns/lengths for finished episodes;
+                 # this is where single-env stats get dumped if they are not aggregated.
+                 env_infos_to_process = [{'episode': {'r': r, 'l': l}}
+                                         for r, l in zip(infos['episode']['r'], infos['episode']['l'])]
+             else:
+                 env_infos_to_process = []
+
+             for env_info in env_infos_to_process:
+                 # Must be a dictionary and not None
+                 if env_info is not None and isinstance(env_info, dict):
+                     # The episode stats are nested under the 'episode' key
+                     if 'episode' in env_info and isinstance(env_info['episode'], dict):
+                         episode_stats = env_info['episode']
+
+                         # Explicitly check for 'r' (return) and 'l' (length) keys
+                         if 'r' in episode_stats and 'l' in episode_stats:
+                             episode_return = float(episode_stats['r'])
+                             episode_length = int(episode_stats['l'])
+
+                             # CRITICAL: populate both the per-rollout and the global list
+                             episode_returns_list.append(episode_return)
+                             all_episode_returns.append(episode_return)  # global list used for checkpoint logging
+                             current_episode += 1
+
+                             with summary_writer.as_default():
+                                 tf.summary.scalar("rollout/episode_return_single", episode_return, step=log_step)
+                                 tf.summary.scalar("rollout/episode_length_single", episode_length, step=log_step)
+             # --- End AsyncVectorEnv logging mechanism ---
+
+         # Increment total timesteps after the rollout is collected
+         current_timesteps += N_STEPS * NUM_ENVS
+         base_timesteps = current_timesteps
+
+         # --- PPO UPDATE ---
+         rollout_data = {k: np.asarray(v) for k, v in rollout_data.items()}
+         _, next_values, _ = agent.select_action(obs)
+         rollout_data['advantages'], rollout_data['returns'] = compute_gae(
+             rollout_data['rewards'],
+             rollout_data['values'],
+             rollout_data['dones'],
+             next_values,
+             GAMMA,
+             GAE_LAMBDA
+         )
+         ppo_batch = {}
+         for k in ['observations', 'actions', 'log_probs', 'returns', 'advantages', 'old_values']:
+             if rollout_data[k].ndim > 2:
+                 ppo_batch[k] = rollout_data[k].reshape((-1,) + rollout_data[k].shape[2:])
+             else:
+                 ppo_batch[k] = rollout_data[k].reshape((-1,))
+
+         policy_loss, value_loss, entropy = agent.learn(ppo_batch)
+
+         # 4. Logging
+         fps = current_timesteps / (time.time() - start_time)
+         with summary_writer.as_default():
+             tf.summary.scalar("train/policy_loss", policy_loss, step=current_timesteps)
+             tf.summary.scalar("train/value_loss", value_loss, step=current_timesteps)
+             tf.summary.scalar("train/entropy", entropy, step=current_timesteps)
+             tf.summary.scalar("perf/timesteps_per_second", fps, step=current_timesteps)
+
+             if episode_returns_list:
+                 avg_rollout_return = np.mean(episode_returns_list)
+                 tf.summary.scalar("rollout/rollout_average_return_rollout", avg_rollout_return, step=current_timesteps)
+             tf.summary.scalar("train/entropy_coef", ENTROPY_COEF, step=current_timesteps)
+
+         summary_writer.flush()
+
+         # CLI reporting
+         if episode_returns_list:
+             avg_return = np.mean(episode_returns_list)
+             print(f"T: {current_timesteps:<10} | Avg Return: {avg_return:7.2f} | Loss: {policy_loss:7.4f} | FPS: {fps:4.0f}")
+         else:
+             print(f"T: {current_timesteps:<10} | No episodes completed this rollout. | Loss: {policy_loss:7.4f} | FPS: {fps:4.0f}")
+
+         # 5. Checkpointing & guaranteed logging
+         # A checkpoint is saved whenever the timesteps land on the defined frequency
+         if (current_timesteps - N_STEPS * NUM_ENVS) % CHECKPOINT_FREQ == 0:
+
+             # --- DIAGNOSTIC CHECK ---
+             print(f"--- DIAGNOSTIC: Total recorded episodes: {len(all_episode_returns)} ---")
+
+             # Log the overall average reward since the start of training
+             if all_episode_returns:
+                 # Use the last 200 episodes for a smooth, reliable moving average
+                 overall_avg_return = np.mean(all_episode_returns[-200:])
+                 with summary_writer.as_default():
+                     # This tag should always populate in TensorBoard
+                     tf.summary.scalar("rollout/overall_average_return_checkpoint", overall_avg_return, step=current_timesteps)
+                 print(f"--- CHECKPOINT: Overall Avg Reward (Last 200): {overall_avg_return:.2f} ---")
+
+             # Save model and resume data
+             agent.save_model(SAVE_PATH, current_timesteps)
+             save_resume_data(resume_path, current_timesteps, current_episode)
+             print(f"Checkpoint saved at timestep: {current_timesteps}")
+
+     # Final save after loop termination
+     print(f"\nTarget reached ({TOTAL_TIMESTEPS} timesteps). Saving final model.")
+     agent.save_model(SAVE_PATH, current_timesteps)
+     save_resume_data(resume_path, current_timesteps, current_episode)
+
+     vec_env.close()
+     summary_writer.close()
+     print("Training complete.")
+
+ if __name__ == "__main__":
+     train()
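The recursion in compute_gae follows the standard GAE formulas noted in its comments. Here is a standalone toy sketch of the same recursion for a single environment over three steps (made-up numbers, independent of the N_STEPS-sized arrays used above):

import numpy as np

gamma, lam = 0.99, 0.95
rewards = np.array([1.0, 0.0, -1.0], dtype=np.float32)
values  = np.array([0.5, 0.4, 0.3], dtype=np.float32)   # V(s_t)
dones   = np.array([0.0, 0.0, 1.0], dtype=np.float32)
next_value = 0.0                                         # bootstrap value V(s_T)

v = np.append(values, next_value)
advantages = np.zeros_like(rewards)
last_gae = 0.0
for t in reversed(range(len(rewards))):
    delta = rewards[t] + gamma * v[t + 1] * (1 - dones[t]) - v[t]   # TD error
    last_gae = delta + gamma * lam * (1 - dones[t]) * last_gae      # GAE recursion
    advantages[t] = last_gae
returns = advantages + values                                        # targets for the value loss
print(advantages, returns)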
reward_shaping.py ADDED
@@ -0,0 +1,113 @@
+ import gymnasium as gym
+ import numpy as np
+ from gymnasium.spaces import Box
+ from gymnasium import Wrapper
+
+ # --- CONFIGURATION FOR REWARD SHAPING ---
+ # Sustainable time penalty for fuel efficiency.
+ TIME_PENALTY = -0.05
+ # ----------------------------------------
+
+ class LunarLanderRewardShaping(Wrapper):
+     """
+     Hyper-optimized shaping for a stable, centered, fuel-efficient landing.
+     Uses catastrophic penalties to enforce terminal stabilization and centering.
+     """
+     def __init__(self, env):
+         super().__init__(env)
+         self.last_shaping_reward = None
+
+     def step(self, action):
+         observation, reward, terminated, truncated, info = self.env.step(action)
+
+         # 1. Unpack the relevant variables from the observation
+         x_pos = observation[0]
+         y_pos = observation[1]
+         x_vel = observation[2]
+         y_vel = observation[3]
+         angle = observation[4]
+         angular_vel = observation[5]
+
+         # Leg contact boolean/float values
+         left_leg_contact = observation[6]
+         right_leg_contact = observation[7]
+
+         # Determine if the main engine (action 2) was fired
+         main_engine_fired = 1 if action == 2 else 0
+
+         # Determine if *any* thruster (actions 1, 2, or 3) was fired
+         any_thruster_fired = 1 if action != 0 else 0
+
+         # Proximity factor (close to 1.0 near the ground, 0.0 high up)
+         proximity_factor = 1.0 - y_pos
+
+         # 2. Calculate the total current shaping value
+         current_shaping_reward = 0.0
+
+         # A. Height-weighted horizontal position penalty
+         #    Heavily penalizes being off-center (-100) ONLY when close to the ground,
+         #    forcing the agent to center its x-position at low altitude.
+         current_shaping_reward += -100 * np.abs(x_pos) * proximity_factor
+
+         # B. Vertical velocity penalty near the ground (remains strong for a soft landing)
+         current_shaping_reward += -40 * np.abs(y_vel) * proximity_factor
+
+         # C. Scaled-back horizontal speed penalty
+         #    Relaxed (-5) to encourage smooth lateral coasting and save side-thruster fuel.
+         current_shaping_reward += -5 * np.abs(x_vel)
+
+         # D. Sharpened angle penalty
+         #    Strong pressure (-10) to hold a vertical attitude.
+         current_shaping_reward += -10 * np.abs(angle)
+
+         # E. Penalize a high spin rate
+         current_shaping_reward += -10 * np.abs(angular_vel)
+
+         # F. Main-engine usage penalty near the ground
+         current_shaping_reward += -20 * main_engine_fired * proximity_factor
+
+         # G. Relaxed height penalty (pressure to descend, but not panic)
+         current_shaping_reward += -10 * y_pos
+
+         # H. Catastrophic post-contact thrust penalty
+         #    Set to -3000 so that ANY post-contact thrust is worse than failing the episode,
+         #    forcing the agent to choose "Action 0: do nothing" immediately upon touchdown.
+         contact_sum = left_leg_contact + right_leg_contact
+         if contact_sum > 0:
+             current_shaping_reward += -3000 * any_thruster_fired * contact_sum
+
+         # I. Aggressive landing-leg use incentive
+         current_shaping_reward += 15 * contact_sum
+
+         # 3. Calculate the differential shaping reward (reward for improvement)
+         if self.last_shaping_reward is not None:
+             shaping_reward_diff = current_shaping_reward - self.last_shaping_reward
+             # Clip the differential reward to prevent massive, unstable jumps
+             reward += np.clip(shaping_reward_diff, -10.0, 10.0)
+
+         self.last_shaping_reward = current_shaping_reward
+
+         # --- SUSTAINABLE TIME PENALTY (-0.05) ---
+         reward += TIME_PENALTY
+         # ----------------------------------------
+
+         return observation, reward, terminated, truncated, info
+
+     def reset(self, **kwargs):
+         """Resets the environment and the shaping tracker."""
+         self.last_shaping_reward = None
+         return self.env.reset(**kwargs)
+
+     def render(self):
+         """Passes the render call to the base environment."""
+         return self.env.render()
+
+     def close(self):
+         """Passes the close call to the base environment."""
+         return self.env.close()
+
+     def __getattr__(self, name):
+         """Delegate attribute access (like observation_space) to the wrapped environment."""
+         return getattr(self.env, name)
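A minimal usage sketch of the wrapper above (assumes gymnasium with the Box2D extra is installed; on the very first step only the time penalty is added, since the differential term needs a previous shaping value):

import gymnasium as gym
from reward_shaping import LunarLanderRewardShaping

env = LunarLanderRewardShaping(gym.make("LunarLander-v3"))
obs, info = env.reset(seed=0)
obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
print(reward)   # base reward + TIME_PENALTY on the first step; shaping kicks in afterwards
env.close()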
trained_agent.py ADDED
@@ -0,0 +1,120 @@
+ import gymnasium as gym
+ import numpy as np
+ import tensorflow as tf
+ import os
+ import time
+
+ # config, agent, and utils are assumed to be in the same directory
+ from config import ENV_ID, SEED, SAVE_PATH, TOTAL_TIMESTEPS
+ from agent import PPOAgent
+ # --- CRITICAL IMPORT ---
+ # Import the custom wrapper class used during training
+ from reward_shaping import LunarLanderRewardShaping
+ # -----------------------
+
+ # Force tf.function-decorated code to run eagerly so .numpy() calls work during playback
+ tf.config.run_functions_eagerly(True)
+
+ def run_trained_agent(episodes=10):
+     """
+     Loads the latest available trained PPO checkpoint and runs it for the given number of episodes.
+     """
+     print(f"--- Running Trained Agent on {ENV_ID} with Human Rendering ---")
+
+     # 1. Environment setup: use a single, non-vectorized environment for clean inference.
+     try:
+         # Create the base environment
+         env = gym.make(ENV_ID, render_mode="human")
+
+         # --- CRITICAL FIX: wrap the environment exactly as during training ---
+         # The agent was trained on this wrapper, so it must be evaluated on it.
+         env = LunarLanderRewardShaping(env)
+         # ---------------------------------------------------------------------
+
+     except Exception as e:
+         print(f"ERROR: Could not create environment {ENV_ID} or apply wrapper. Details: {e}")
+         return
+
+     obs_shape = env.observation_space.shape
+
+     # Explicitly pass the integer size (.n) instead of the Discrete object
+     action_size = env.action_space.n
+
+     # Reset the single environment
+     current_obs, info = env.reset(seed=SEED)
+
+     # 2. Agent initialization (pass the integer size of the action space)
+     agent = PPOAgent(obs_shape, action_size, TOTAL_TIMESTEPS)
+
+     # 3. Load the latest checkpoint
+     latest_checkpoint = agent.checkpoint_manager.latest_checkpoint
+
+     if not latest_checkpoint:
+         print("\nERROR: Could not find any checkpoint in the designated save path.")
+         env.close()
+         return
+     try:
+         # Restore everything tracked by the checkpoint object
+         agent.checkpoint.restore(latest_checkpoint).expect_partial()
+
+         # Manually sync the NumPy-based RMS object after restoration
+         agent.obs_rms.mean = agent.rms_mean_var.numpy()
+         agent.obs_rms.var = agent.rms_var_var.numpy()
+         agent.obs_rms.count = agent.rms_count_var.numpy()
+
+         loaded_timesteps = int(os.path.basename(latest_checkpoint).split('-')[-1])
+         print(f"\nSuccessfully loaded latest checkpoint trained to T={loaded_timesteps}")
+
+     except Exception as e:
+         print(f"\nERROR: Failed to restore checkpoint at {latest_checkpoint}. Details: {e}")
+         print("Suggestion: check the consistency of your environment setup (wrapper, action size).")
+         env.close()
+         return
+
+     # 4. Run episodes
+     print(f"\nStarting {episodes} playback episodes...")
+     total_rewards = []
+     for i in range(episodes):
+         done = False
+         episode_reward = 0
+         step_count = 0
+
+         while not done:
+             # Call env.render() inside the loop to refresh the visualization
+             env.render()
+
+             # current_obs has shape (8,); reshape to (1, 8) for the agent's model input
+             obs_to_agent = current_obs.reshape(1, *obs_shape)
+
+             # Select an action
+             actions, _, _ = agent.select_action(obs_to_agent)
+
+             # Step the environment with the single action
+             action_to_step = actions[0]
+             current_obs, reward, terminated, truncated, info = env.step(action_to_step)
+
+             done = terminated or truncated
+             episode_reward += reward
+             step_count += 1
+
+             # Simple visualization pause
+             time.sleep(0.01)
+
+         total_rewards.append(episode_reward)
+         print(f"Episode {i+1}: Reward = {episode_reward:7.2f}, Steps = {step_count}")
+
+         # Reset for the next episode
+         current_obs, info = env.reset()
+
+     # 5. Cleanup and summary
+     env.close()
+
+     if total_rewards:
+         print("-" * 30)
+         print(f"Average Reward over {episodes} episodes: {np.mean(total_rewards):7.2f}")
+         print("-" * 30)
+
+ if __name__ == "__main__":
+     run_trained_agent(episodes=10)
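Note that select_action samples from the categorical policy, so playback above remains stochastic. A hedged sketch of a deterministic alternative for evaluation (a hypothetical helper, not part of this upload) that picks the argmax action from the same policy network:

import numpy as np
import tensorflow as tf

def select_action_greedy(agent, obs):
    """Evaluation-only variant: take the most likely action instead of sampling."""
    obs_tensor = tf.convert_to_tensor(obs, dtype=tf.float32)
    logits = agent.policy(agent.normalize_obs(obs_tensor), training=False)
    return np.argmax(logits.numpy(), axis=-1)

# Drop-in replacement inside the playback loop:
# action_to_step = select_action_greedy(agent, obs_to_agent)[0]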
utils.py ADDED
@@ -0,0 +1,102 @@
+ import gymnasium as gym
+ from gymnasium.vector import AsyncVectorEnv
+ import numpy as np
+ import tensorflow as tf
+ import json
+ import os
+ import time
+
+ # --- Import the reward shaping wrapper ---
+ from reward_shaping import LunarLanderRewardShaping
+ # -----------------------------------------
+ from config import *  # Import constants like ENV_ID, N_STEPS, NUM_ENVS, GAMMA, RESUME_FILE
+
+ # --- Helper Functions for Checkpointing ---
+ def save_resume_data(filepath, timesteps, episodes):
+     """
+     Saves the current training state to a JSON file at the specified filepath.
+     """
+     data = {
+         "timesteps": timesteps,
+         "episode_count": episodes,
+         "date": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+     }
+     try:
+         with open(filepath, 'w') as f:
+             json.dump(data, f)
+     except Exception as e:
+         print(f"Error saving resume data to {filepath}: {e}")
+
+ def load_resume_data(filepath):
+     """
+     Loads the last saved training state from the specified JSON file.
+     Returns (timesteps, episode_count).
+     """
+     if os.path.exists(filepath):
+         try:
+             with open(filepath, 'r') as f:
+                 data = json.load(f)
+             timesteps = data.get("timesteps", 0)
+             episodes = data.get("episode_count", 0)
+             return (timesteps, episodes)
+         except Exception as e:
+             print(f"Error loading resume data from {filepath}: {e}. Starting from scratch.")
+     return (0, 0)
+
+ # --- Environment Setup for Parallel Execution ---
+ def make_env(env_id, seed, idx, **kwargs):
+     """
+     Creates a single environment instance with a unique seed and applies the necessary wrappers.
+     """
+     def thunk():
+         # 1. Create the base environment
+         env = gym.make(env_id, **kwargs)
+
+         # 2. Apply the custom wrapper (LunarLanderRewardShaping)
+         #    Essential for LunarLander to learn from a dense reward structure
+         env = LunarLanderRewardShaping(env)
+
+         # 3. Apply the standard logging wrapper (critical fix for logging)
+         #    This wrapper tracks the episode reward ('r') and length ('l')
+         #    and puts them into the 'infos' dictionary when the episode is done.
+         env = gym.wrappers.RecordEpisodeStatistics(env)
+
+         # 4. Apply seeding to the final wrapped environment
+         env.action_space.seed(seed + idx)
+         env.observation_space.seed(seed + idx)
+
+         # 'env' is now the fully wrapped, final environment instance
+         return env
+     return thunk
+
+ def make_parallel_envs(env_id, num_envs, seed):
+     """
+     Creates multiple environments and wraps them in an AsyncVectorEnv.
+     """
+     env_fns = [make_env(env_id, seed, i) for i in range(num_envs)]
+     return AsyncVectorEnv(env_fns)
+
+ # --- GAE Calculation (note: main.py defines and uses its own compute_gae) ---
+ def calculate_gae(rewards, values, terminated, truncated, next_value, gamma=GAMMA, gae_lambda=GAE_LAMBDA):
+     """
+     Calculates Generalized Advantage Estimation (GAE) and returns from rollout data.
+     """
+     advantages = np.zeros_like(rewards, dtype=np.float32)
+     last_gae_lambda = 0
+
+     for t in reversed(range(N_STEPS)):
+         non_terminal = 1.0 - (terminated[t] | truncated[t]).astype(np.float32)
+         # V(s_{t+1}): bootstrap with next_value for the last step, otherwise use the next stored value
+         next_values = next_value if t == N_STEPS - 1 else values[t + 1]
+
+         delta = rewards[t] + gamma * next_values * non_terminal - values[t]
+
+         advantages[t] = delta + gamma * gae_lambda * non_terminal * last_gae_lambda
+         last_gae_lambda = advantages[t]
+
+     returns = advantages + values
+     return advantages, returns
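A small usage sketch for the resume helpers above (hypothetical file name; assumes the project dependencies are installed, since importing utils pulls in config and TensorFlow):

from utils import save_resume_data, load_resume_data

save_resume_data("demo_resume.json", timesteps=49_152, episodes=37)
print(load_resume_data("demo_resume.json"))   # -> (49152, 37)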