"""Entry point for PPO training with checkpointing and TensorBoard logging."""

import os

import gymnasium as gym
import numpy as np
import tensorflow as tf

import config
from utilities import init_gpu
from agent import PPOAgent
from trainer import PPOTrainer

# Keras 3 `save_weights` only accepts HDF5 file names ending in ".weights.h5",
# which is why the older "ppo_mountain_car_weights.h5" name was abandoned.
_BEST_WEIGHTS_PATH = "ppo_mountain_car_weights.weights.h5"

# Score reported on the console when a rollout batch finished no episode.
_NO_EPISODE_SCORE = -100.0

# Console report / periodic checkpoint cadence, in iterations.
_REPORT_EVERY = 5


def _lookup_action_bound(env_name):
    """Return the first component of the environment's upper action bound.

    A throwaway environment is created only to read its action space; it is
    closed even if the lookup fails.
    """
    probe_env = gym.make(env_name)
    try:
        return probe_env.action_space.high[0]
    finally:
        probe_env.close()


def _has_episodes(ep_scores):
    """Return True when at least one episode score was collected.

    An explicit length check is used instead of truthiness so that a NumPy
    array of scores does not raise "truth value of an array is ambiguous".
    """
    return ep_scores is not None and len(ep_scores) > 0


def _run_training(agent, trainer):
    """Run the optimisation loop, resuming from the latest checkpoint if any.

    Args:
        agent: The PPO agent; its `ac` model and `actor_opt` / `critic_opt`
            optimizers are tracked by the checkpoint.
        trainer: Object providing `collect_rollouts()` and `train_epoch(...)`.
    """
    summary_writer = tf.summary.create_file_writer(config.LOG_DIR)
    try:
        # Checkpoint covers model weights, both optimizers and the iteration
        # counter, so a resumed run continues with the same optimizer state.
        global_iteration = tf.Variable(1, dtype=tf.int64)
        checkpoint = tf.train.Checkpoint(
            actor_critic=agent.ac,
            actor_optimizer=agent.actor_opt,
            critic_optimizer=agent.critic_opt,
            iteration=global_iteration,
        )
        checkpoint_manager = tf.train.CheckpointManager(
            checkpoint, directory=config.CHECKPOINT_DIR, max_to_keep=1000
        )

        # Automatically check for existing weights to resume execution.
        if checkpoint_manager.latest_checkpoint:
            checkpoint.restore(checkpoint_manager.latest_checkpoint)
            print(f"⚔ Resuming pipeline safely from Checkpoint Iteration: {global_iteration.numpy()}")
            # The checkpoint is written at the END of an iteration, so the
            # stored counter names an iteration that is already complete.
            # Continue with the next one instead of repeating it (which would
            # also write a duplicate TensorBoard step).
            start_iter = int(global_iteration.numpy()) + 1
        else:
            print("🌱 No active checkpoint located. Initializing new optimization cycle...")
            start_iter = int(global_iteration.numpy())

        # Highest mean score seen so far in THIS process.
        # NOTE(review): this is not persisted, so after a resume the first
        # positive score overwrites the best-weights file even if an earlier
        # run scored higher — confirm whether that is acceptable.
        best_score = -float('inf')

        for itr in range(start_iter, config.TOTAL_ITERATIONS + 1):
            global_iteration.assign(itr)

            # 1. Gather rollouts.
            states, actions, log_probs, returns, advantages, ep_scores = trainer.collect_rollouts()

            # 2. Perform optimization on the collected data.
            actor_loss, critic_loss = trainer.train_epoch(
                states, actions, log_probs, returns, advantages
            )

            # 3. Log values to TensorBoard.
            episodes_finished = _has_episodes(ep_scores)
            avg_score = np.mean(ep_scores) if episodes_finished else _NO_EPISODE_SCORE
            log_std = agent.ac.log_std.numpy()[0]

            with summary_writer.as_default():
                tf.summary.scalar("Loss/Actor_Loss", actor_loss, step=itr)
                tf.summary.scalar("Loss/Critic_Loss", critic_loss, step=itr)
                tf.summary.scalar("Parameters/Exploration_Log_Std", log_std, step=itr)
                # The placeholder score is never written to TensorBoard.
                if episodes_finished:
                    tf.summary.scalar("Metrics/Mean_Reward_Raw", avg_score, step=itr)

            # NOTE(review): the original file lost its indentation; the
            # checkpoint save is assumed to belong to this periodic branch and
            # the best-weight tracking below to every iteration — confirm.
            if itr % _REPORT_EVERY == 0:
                # The value labelled "Variance" is the first log-std entry.
                print(f"Iteration: {itr:3d}/{config.TOTAL_ITERATIONS} | Mean Env Score: {avg_score:6.2f} | Variance: {log_std:.3f}")

                # Save periodic checkpoint defensively for power-loss protection.
                checkpoint_manager.save()

            # Best-weight tracking: export weights whenever the mean score
            # improves and is positive.
            if avg_score > best_score and avg_score > 0.0:
                best_score = avg_score
                print(f"🌟 New performance milestone! Saving best weights with score: {best_score:.2f}")
                agent.ac.save_weights(_BEST_WEIGHTS_PATH)

        print(f"\n🏁 Finished all {config.TOTAL_ITERATIONS} training iterations successfully.")
    finally:
        # Flush pending summaries even when training is interrupted.
        summary_writer.close()


def main():
    """Configure the GPU, build the agent and trainer, and run PPO training.

    The trainer is always closed on exit, including when training raises or
    is interrupted from the keyboard.
    """
    init_gpu()

    # Dynamic lookup to establish environment dimensions.
    action_bounds = _lookup_action_bound(config.ENV_NAME)

    agent = PPOAgent(action_bounds)
    trainer = PPOTrainer(agent)
    try:
        _run_training(agent, trainer)
    finally:
        trainer.close()


if __name__ == "__main__":
    main()