Upload 8 files
Browse files- Dumb_Agent.py +40 -0
- Environment_Constants.py +23 -0
- PPO_Model.py +272 -0
- PPO_Trainer.py +231 -0
- Snake_EnvAndAgent.py +312 -0
- Trained_PPO_Agent.py +110 -0
- plot_utility_Trained_Agent.py +84 -0
- plot_utility_Trainer.py +84 -0
Dumb_Agent.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gymnasium as gym
from Snake_EnvAndAgent import SnakeGameEnv
import pygame
import time


def _run_random_episode(env):
    """Play one episode with uniformly random actions.

    Returns (steps, total_reward, info) where info is the final step's info
    dict (carries the game 'score').
    """
    obs, info = env.reset()
    total_reward = 0
    steps = 0
    terminated = truncated = False
    while not (terminated or truncated):
        action = env.action_space.sample()
        obs, reward, terminated, truncated, info = env.step(action)
        total_reward += reward
        steps += 1
    return steps, total_reward, info


if __name__ == "__main__":
    # Smoke-test the environment with a "dumb" random agent and live rendering.
    env = SnakeGameEnv(render_mode='human')

    episodes = 5
    for episode in range(episodes):
        print(f"--- Starting Episode {episode + 1} ---")
        steps, total_reward, info = _run_random_episode(env)
        print(f"Episode {episode + 1} finished in {steps} steps with total reward: {total_reward:.2f}")
        print(f"Final Score: {info['score']}")

    env.close()
    print("Environment test finished.")
|
Environment_Constants.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Board geometry: the playing field is GRID_SIZE x GRID_SIZE cells,
# each rendered as a CELL_SIZE x CELL_SIZE pixel square.
GRID_SIZE = 30
CELL_SIZE = 30

# Window size in pixels, derived from the grid.
SCREEN_WIDTH = GRID_SIZE * CELL_SIZE
SCREEN_HEIGHT = GRID_SIZE * CELL_SIZE

# RGB color tuples used by the pygame renderer.
WHITE = (255, 255, 255)
BLACK = (0, 0, 0)
GREEN = (0, 255, 0)
RED = (255, 0, 0)
BLUE = (0, 0, 255)

# Direction vectors as (dx, dy) grid offsets (UP = (0, -1) implies the
# y coordinate grows downward, as in pygame screen coordinates).
UP = (0, -1)
DOWN = (0, 1)
LEFT = (-1, 0)
RIGHT = (1, 0)

# Frame rate for 'human' render mode.
FPS = 10

# Reward shaping: large bonus for eating food, large penalty for a
# collision, and a small per-step penalty to discourage wandering.
REWARD_FOOD = 60
REWARD_COLLISION = -60
REWARD_STEP = -0.1
# Length of the 1-D feature vector the environment exposes as its observation.
OBSERVATION_SPACE_SIZE = 11
|
PPO_Model.py
ADDED
|
@@ -0,0 +1,272 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import tensorflow as tf
|
| 2 |
+
import keras
|
| 3 |
+
from keras import layers, Model
|
| 4 |
+
import numpy as np
|
| 5 |
+
import tensorflow_probability as tfp
|
| 6 |
+
import os
|
| 7 |
+
import traceback
|
| 8 |
+
|
| 9 |
+
tfd = tfp.distributions
|
| 10 |
+
|
| 11 |
+
@tf.keras.utils.register_keras_serializable()
class Actor(Model):
    """Policy network: maps observations to unnormalized action logits.

    Registered as Keras-serializable so it round-trips through
    `model.save(...)` / `tf.keras.models.load_model(...)` via `get_config`.
    """

    def __init__(self, obs_shape, action_size, hidden_layer_sizes=(512, 512, 512), **kwargs):
        # NOTE: the default is an immutable tuple — the original used a
        # mutable list literal, which is shared across all calls.
        super().__init__(**kwargs)
        # Multi-dimensional observations are flattened; 1-D pass through.
        if len(obs_shape) > 1:
            self.flatten = layers.Flatten(input_shape=obs_shape)
            # Build the layer eagerly so its weights exist before first call.
            # tuple(obs_shape) guards against obs_shape arriving as a list
            # (e.g. after a get_config()/JSON round-trip).
            self.flatten(tf.zeros((1,) + tuple(obs_shape)))
        else:
            self.flatten = None

        self.dense_layers = [layers.Dense(size, activation='relu')
                             for size in hidden_layer_sizes]
        # Raw logits — no softmax; downstream code builds a Categorical.
        self.logits = layers.Dense(action_size)

        # Stored for get_config() so save/load reconstructs the same topology.
        self._obs_shape = obs_shape
        self._action_size = action_size
        self._hidden_layer_sizes = list(hidden_layer_sizes)

    def call(self, inputs):
        """Forward pass: returns a (batch, action_size) tensor of logits."""
        x = self.flatten(inputs) if self.flatten else inputs
        for layer in self.dense_layers:
            x = layer(x)
        return self.logits(x)

    def get_config(self):
        """Serialize constructor arguments for Keras model saving."""
        config = super().get_config()
        config.update({
            'obs_shape': self._obs_shape,
            'action_size': self._action_size,
            'hidden_layer_sizes': self._hidden_layer_sizes
        })
        return config
| 44 |
+
|
| 45 |
+
@tf.keras.utils.register_keras_serializable()
class Critic(Model):
    """Value network: maps observations to a scalar state-value estimate.

    Registered as Keras-serializable so it round-trips through
    `model.save(...)` / `tf.keras.models.load_model(...)` via `get_config`.
    """

    def __init__(self, obs_shape, hidden_layer_sizes=(512, 512, 512), **kwargs):
        # NOTE: the default is an immutable tuple — the original used a
        # mutable list literal, which is shared across all calls.
        super().__init__(**kwargs)
        # Multi-dimensional observations are flattened; 1-D pass through.
        if len(obs_shape) > 1:
            self.flatten = layers.Flatten(input_shape=obs_shape)
            # Build the layer eagerly; tuple(obs_shape) guards against a
            # list arriving from a get_config()/JSON round-trip.
            self.flatten(tf.zeros((1,) + tuple(obs_shape)))
        else:
            self.flatten = None

        self.dense_layers = [layers.Dense(size, activation='relu')
                             for size in hidden_layer_sizes]
        self.value = layers.Dense(1)  # single scalar value head

        # Stored for get_config() so save/load reconstructs the same topology.
        self._obs_shape = obs_shape
        self._hidden_layer_sizes = list(hidden_layer_sizes)

    def call(self, inputs):
        """Forward pass: returns a (batch, 1) tensor of state values."""
        x = self.flatten(inputs) if self.flatten else inputs
        for layer in self.dense_layers:
            x = layer(x)
        return self.value(x)

    def get_config(self):
        """Serialize constructor arguments for Keras model saving."""
        config = super().get_config()
        config.update({
            'obs_shape': self._obs_shape,
            'hidden_layer_sizes': self._hidden_layer_sizes
        })
        return config
|
| 76 |
+
|
| 77 |
+
class PPOAgent:
    """PPO agent with the clipped surrogate objective, GAE(lambda)
    advantage estimation, and invalid-action masking.

    Changes vs. the original: immutable tuple default for
    `hidden_layer_sizes`, rollout-buffer reset deduplicated into
    `_reset_buffers()`, and GAE computed by reverse fill instead of
    O(n^2) `list.insert(0, ...)`.
    """

    def __init__(self, observation_space_shape, action_space_size,
                 actor_lr=3e-4, critic_lr=3e-4, gamma=0.99,
                 gae_lambda=0.95, clip_epsilon=0.2,
                 num_epochs_per_update=10, batch_size=64,
                 hidden_layer_sizes=(512, 512, 512)):

        self.gamma = gamma                  # reward discount factor
        self.gae_lambda = gae_lambda        # GAE smoothing factor
        self.clip_epsilon = clip_epsilon    # PPO surrogate clip range
        self.num_epochs_per_update = num_epochs_per_update
        self.batch_size = batch_size

        self.observation_space_shape = observation_space_shape
        self.action_space_size = action_space_size

        self.actor = Actor(observation_space_shape, action_space_size, hidden_layer_sizes=hidden_layer_sizes)
        self.critic = Critic(observation_space_shape, hidden_layer_sizes=hidden_layer_sizes)

        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr)
        self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=critic_lr)

        self._reset_buffers()

        # Dummy forward pass so both networks build their weights eagerly.
        dummy_obs = tf.zeros((1,) + tuple(observation_space_shape), dtype=tf.float32)
        self.actor(dummy_obs)
        self.critic(dummy_obs)

    def _reset_buffers(self):
        """Clear the rollout storage consumed by learn()."""
        self.states = []
        self.actions = []
        self.rewards = []
        self.next_states = []
        self.dones = []
        self.log_probs = []
        self.values = []
        self.action_masks = []

    def remember(self, state, action, reward, next_state, done, log_prob, value, action_mask):
        """Append one transition (plus its action mask) to the rollout buffer."""
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)
        self.next_states.append(next_state)
        self.dones.append(done)
        self.log_probs.append(log_prob)
        self.values.append(value)
        self.action_masks.append(action_mask)

    @tf.function
    def _choose_action_tf(self, observation, action_mask):
        """Graph-compiled sampling: mask, sample, and evaluate in one pass."""
        observation = tf.expand_dims(tf.convert_to_tensor(observation, dtype=tf.float32), 0)

        pi_logits = self.actor(observation)

        # Invalid actions get a huge negative logit -> ~zero probability.
        masked_logits = tf.where(action_mask, pi_logits, -1e9)

        value = self.critic(observation)

        distribution = tfd.Categorical(logits=masked_logits)
        action = distribution.sample()
        log_prob = distribution.log_prob(action)

        return action, log_prob, value

    def choose_action(self, observation, action_mask):
        """Sample an action; returns (action array, log-prob array, scalar value)."""
        action_tensor, log_prob_tensor, value_tensor = self._choose_action_tf(observation, tf.constant(action_mask, dtype=tf.bool))
        return action_tensor.numpy(), log_prob_tensor.numpy(), value_tensor.numpy()[0, 0]

    def calculate_advantages_and_returns(self):
        """Compute GAE(lambda) advantages and bootstrapped returns.

        Returns:
            (advantages, returns) as float32 numpy arrays over the buffer.
        """
        rewards = np.array(self.rewards, dtype=np.float32)
        values = np.array(self.values, dtype=np.float32)
        dones = np.array(self.dones, dtype=np.float32)

        # Bootstrap from the critic only if the rollout did not end terminal.
        if dones[-1]:
            last_next_state_value = 0
        else:
            last_next_state_value = self.critic(
                tf.expand_dims(tf.convert_to_tensor(self.next_states[-1], dtype=tf.float32), 0)
            ).numpy()[0, 0]
        next_values = np.append(values[1:], last_next_state_value)

        # Reverse fill (O(n)) instead of repeated list.insert(0, ...) (O(n^2)).
        advantages = np.zeros_like(rewards)
        last_advantage = 0.0
        for t in reversed(range(len(rewards))):
            delta = rewards[t] + self.gamma * next_values[t] * (1 - dones[t]) - values[t]
            last_advantage = delta + self.gae_lambda * self.gamma * (1 - dones[t]) * last_advantage
            advantages[t] = last_advantage

        returns = advantages + values
        return advantages, returns

    def learn(self):
        """Run `num_epochs_per_update` PPO epochs over the stored rollout,
        then clear the buffers. No-op when the buffer is empty."""
        if not self.states:
            return

        states = tf.convert_to_tensor(np.array(self.states), dtype=tf.float32)
        actions = tf.convert_to_tensor(np.array(self.actions), dtype=tf.int32)
        old_log_probs = tf.convert_to_tensor(np.array(self.log_probs), dtype=tf.float32)

        action_masks = tf.convert_to_tensor(np.array(self.action_masks), dtype=tf.bool)

        advantages, returns = self.calculate_advantages_and_returns()
        # Normalize advantages for update stability.
        advantages = (advantages - tf.reduce_mean(advantages)) / (tf.math.reduce_std(advantages) + 1e-8)

        dataset = tf.data.Dataset.from_tensor_slices((states, actions, old_log_probs, advantages, returns, action_masks))
        dataset = dataset.shuffle(buffer_size=len(self.states)).batch(self.batch_size)

        for _ in range(self.num_epochs_per_update):
            for batch_states, batch_actions, batch_old_log_probs, batch_advantages, batch_returns, batch_action_masks in dataset:

                # --- Actor update: clipped surrogate objective ---
                with tf.GradientTape() as tape:
                    current_logits = self.actor(batch_states)

                    # Apply the same mask used at collection time so the new
                    # policy's log-probs are comparable to the old ones.
                    masked_logits = tf.where(batch_action_masks, current_logits, -1e9)
                    new_distribution = tfd.Categorical(logits=masked_logits)

                    new_log_probs = new_distribution.log_prob(batch_actions)
                    ratio = tf.exp(new_log_probs - batch_old_log_probs)

                    surrogate1 = ratio * batch_advantages
                    surrogate2 = tf.clip_by_value(ratio, 1 - self.clip_epsilon, 1 + self.clip_epsilon) * batch_advantages

                    actor_loss = -tf.reduce_mean(tf.minimum(surrogate1, surrogate2))

                actor_grads = tape.gradient(actor_loss, self.actor.trainable_variables)
                self.actor_optimizer.apply_gradients(zip(actor_grads, self.actor.trainable_variables))

                # --- Critic update: MSE against the bootstrapped returns ---
                with tf.GradientTape() as tape:
                    new_values = self.critic(batch_states)
                    critic_loss = tf.reduce_mean(tf.square(new_values - batch_returns))

                critic_grads = tape.gradient(critic_loss, self.critic.trainable_variables)
                self.critic_optimizer.apply_gradients(zip(critic_grads, self.critic.trainable_variables))

        self._reset_buffers()

    def save_models(self, path):
        """Save actor and critic as `{path}_actor.keras` / `{path}_critic.keras`.

        Failures are logged (with traceback) but never raised, so a save
        error cannot kill a long training run.
        """
        actor_save_path = f"{path}_actor.keras"
        critic_save_path = f"{path}_critic.keras"
        print(f"\n--- Attempting to save models ---")
        print(f"Target Actor path: {os.path.abspath(actor_save_path)}")
        print(f"Target Critic path: {os.path.abspath(critic_save_path)}")
        try:
            self.actor.save(actor_save_path)
            print(f"Actor model saved successfully to {os.path.abspath(actor_save_path)}")
        except Exception as e:
            print(f"ERROR: Failed to save Actor model to {os.path.abspath(actor_save_path)}")
            print(f"Reason: {e}")
            traceback.print_exc()
        try:
            self.critic.save(critic_save_path)
            print(f"Critic model saved successfully to {os.path.abspath(critic_save_path)}")
        except Exception as e:
            print(f"ERROR: Failed to save Critic model to {os.path.abspath(critic_save_path)}")
            print(f"Reason: {e}")
            traceback.print_exc()
        print(f"--- Models save process completed ---\n")

    def load_models(self, path):
        """Load actor/critic from `{path}_actor.keras` / `{path}_critic.keras`.

        Returns True only when BOTH models load; on any failure the agent
        keeps its freshly-initialized (untrained) networks and returns False.
        """
        actor_load_path = f"{path}_actor.keras"
        critic_load_path = f"{path}_critic.keras"
        actor_loaded_ok = False
        critic_loaded_ok = False

        # Map saved class names back to the local serializable classes.
        custom_objects = {
            'Actor': Actor,
            'Critic': Critic
        }

        try:
            self.actor = tf.keras.models.load_model(actor_load_path, custom_objects=custom_objects)
            actor_loaded_ok = True
            print(f"Actor model loaded from: {os.path.abspath(actor_load_path)}")
        except Exception as e:
            print(f"ERROR: Failed to load Actor model from {os.path.abspath(actor_load_path)}")
            print(f"Reason: {e}")
            traceback.print_exc()

        try:
            self.critic = tf.keras.models.load_model(critic_load_path, custom_objects=custom_objects)
            critic_loaded_ok = True
            print(f"Critic model loaded from: {os.path.abspath(critic_load_path)}")
        except Exception as e:
            print(f"ERROR: Failed to load Critic model from {os.path.abspath(critic_load_path)}")
            print(f"Reason: {e}")
            traceback.print_exc()

        if actor_loaded_ok and critic_loaded_ok:
            print(f"All PPO models loaded successfully from '{path}'.")
            return True
        else:
            print(f"Warning: One or both models failed to load. The agent will use untrained models.")
            return False
|
PPO_Trainer.py
ADDED
|
@@ -0,0 +1,231 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gymnasium as gym
|
| 2 |
+
from Snake_EnvAndAgent import SnakeGameEnv
|
| 3 |
+
from PPO_Model import PPOAgent
|
| 4 |
+
import numpy as np
|
| 5 |
+
import time
|
| 6 |
+
import os
|
| 7 |
+
import json # For saving/loading training state
|
| 8 |
+
from plot_utility_Trainer import plot_rewards, smooth_curve, init_live_plot, update_live_plot, save_live_plot_final
|
| 9 |
+
|
| 10 |
+
# Training configuration. Read throughout train_agent(); note that
# 'resume_training' is also mutated at runtime when no checkpoint is found.
HYPERPARAMETERS = {
    'grid_size': 30,  # This is used for environment initialization
    'actor_lr': 0.0003,
    'critic_lr': 0.0003,
    'gamma': 0.99,  # reward discount factor
    'gae_lambda': 0.95,  # GAE smoothing factor
    'clip_epsilon': 0.2,  # PPO clipped-surrogate range
    'num_epochs_per_update': 10,
    'batch_size': 64,
    'num_steps_per_rollout': 2048,  # Number of steps to collect before a learning update
    'total_timesteps': 10_000_000,  # Total environmental steps to train for
    'hidden_layer_sizes': [512, 512, 512],
    'save_interval_timesteps': 400000,  # Save models every N total timesteps
    'log_interval_episodes': 10,  # Log training progress every N episodes
    'render_training': False,  # Set to True to see rendering during training (will slow down)
    'render_fps_limit': 10,  # Limits render FPS, if 0, renders as fast as possible (can be too fast)
    'plot_smoothing_factor': 0.9,  # For smoothing the reward plot
    'live_plot_interval_episodes': 100,  # Update live plot every N episodes
    'resume_training': True  # Set to True to attempt to resume from latest checkpoint
}

# Directory for saving models and plots
MODEL_SAVE_DIR = 'snake_ppo_models'
PLOT_SAVE_DIR = 'snake_ppo_plots'
# JSON file holding resumable counters/reward history between runs.
TRAINING_STATE_FILE = os.path.join(MODEL_SAVE_DIR, 'training_state.json')

# Created at import time so later saves cannot fail on a missing directory.
os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
os.makedirs(PLOT_SAVE_DIR, exist_ok=True)
print(f"Model save directory created/checked: {os.path.abspath(MODEL_SAVE_DIR)}")
print(f"Plot save directory created/checked: {os.path.abspath(PLOT_SAVE_DIR)}")
|
| 40 |
+
|
| 41 |
+
def save_training_state(total_timesteps_trained, episode_count, all_episode_rewards, plot_rewards_history):
    """Persist resumable training progress to TRAINING_STATE_FILE as JSON.

    Stores the global step/episode counters plus the full reward history so
    a later run can continue logging and plotting from where this one stopped.
    """
    with open(TRAINING_STATE_FILE, 'w') as f:
        json.dump({
            'total_timesteps_trained': total_timesteps_trained,
            'episode_count': episode_count,
            'all_episode_rewards': all_episode_rewards,
            'plot_rewards_history': plot_rewards_history,
        }, f)
    print(f"Training state saved to {TRAINING_STATE_FILE}")
|
| 51 |
+
|
| 52 |
+
def load_training_state():
    """Load saved training progress, falling back to a fresh state.

    Returns:
        (total_timesteps_trained, episode_count, all_episode_rewards,
        plot_rewards_history) — zeros/empty lists when the state file is
        missing, unreadable, or incomplete.
    """
    if os.path.exists(TRAINING_STATE_FILE):
        try:
            with open(TRAINING_STATE_FILE, 'r') as f:
                state = json.load(f)
            print(f"Training state loaded from {TRAINING_STATE_FILE}")
            return state['total_timesteps_trained'], \
                   state['episode_count'], \
                   state['all_episode_rewards'], \
                   state['plot_rewards_history']
        except (json.JSONDecodeError, KeyError, OSError) as e:
            # A corrupt or partially-written state file (e.g. from a crash
            # mid-save) should not abort training — start counters fresh.
            print(f"WARNING: Could not read training state from {TRAINING_STATE_FILE}: {e}")
    return 0, 0, [], []
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def train_agent():
    """Run the full PPO training loop for the Snake environment.

    Collects rollouts of `num_steps_per_rollout` steps, updates the agent
    after each rollout, periodically checkpoints models + training state,
    and maintains a live reward plot. Resumes from the newest checkpoint
    in MODEL_SAVE_DIR when HYPERPARAMETERS['resume_training'] is True.

    Fix vs. the original: resume loading joins the checkpoint filename with
    MODEL_SAVE_DIR — the original passed the bare `os.listdir` name, so
    `load_models` searched the current working directory and resume failed
    whenever CWD was not the model directory.
    """
    print(f"Current working directory: {os.getcwd()}")
    print("Initializing environment and agent...")

    render_mode = 'human' if HYPERPARAMETERS['render_training'] else None

    env = SnakeGameEnv(render_mode=render_mode)

    if HYPERPARAMETERS['render_training'] and HYPERPARAMETERS['render_fps_limit'] > 0:
        env.metadata["render_fps"] = HYPERPARAMETERS['render_fps_limit']

    obs_shape = env.observation_space.shape
    action_size = env.action_space.n

    agent = PPOAgent(
        observation_space_shape=obs_shape,
        action_space_size=action_size,
        actor_lr=HYPERPARAMETERS['actor_lr'],
        critic_lr=HYPERPARAMETERS['critic_lr'],
        gamma=HYPERPARAMETERS['gamma'],
        gae_lambda=HYPERPARAMETERS['gae_lambda'],
        clip_epsilon=HYPERPARAMETERS['clip_epsilon'],
        num_epochs_per_update=HYPERPARAMETERS['num_epochs_per_update'],
        batch_size=HYPERPARAMETERS['batch_size'],
        hidden_layer_sizes=HYPERPARAMETERS['hidden_layer_sizes']
    )

    total_timesteps_trained = 0
    episode_count = 0
    all_episode_rewards = []
    plot_rewards_history = []
    last_saved_timesteps = 0

    # --- Resume Training Logic ---
    if HYPERPARAMETERS['resume_training']:
        print("Attempting to resume training...")
        latest_checkpoint = None
        # Checkpoints are named "ppo_snake_<timestep>_actor.keras"; parse
        # the timestep and keep the newest. Names like "ppo_snake_final"
        # raise ValueError on int() and are skipped.
        for f in os.listdir(MODEL_SAVE_DIR):
            if f.endswith('_actor.keras'):
                try:
                    timestep_str = f.split('_')[-2]
                    timestep = int(timestep_str)
                    if latest_checkpoint is None or timestep > latest_checkpoint[0]:
                        latest_checkpoint = (timestep, f.replace('_actor.keras', ''))
                except ValueError:
                    continue

        if latest_checkpoint:
            print(f"Found latest checkpoint: {latest_checkpoint[1]}")
            # FIX: join with MODEL_SAVE_DIR — latest_checkpoint[1] is a bare
            # filename from os.listdir, not a path.
            if agent.load_models(os.path.join(MODEL_SAVE_DIR, latest_checkpoint[1])):
                total_timesteps_trained, episode_count, all_episode_rewards, plot_rewards_history = load_training_state()
                last_saved_timesteps = total_timesteps_trained
                print(f"Resumed from Timestep: {total_timesteps_trained}, Episode: {episode_count}")
            else:
                print("Failed to load models. Starting new training run.")
                HYPERPARAMETERS['resume_training'] = False
        else:
            print("No previous checkpoints found. Starting new training run.")
            HYPERPARAMETERS['resume_training'] = False

    print("Starting training loop...")
    start_time = time.time()

    fig, ax, line = init_live_plot(PLOT_SAVE_DIR, filename="live_ppo_training_progress.png")
    # When resuming, seed the live plot with the restored reward history.
    if HYPERPARAMETERS['resume_training'] and len(plot_rewards_history) > 1:
        episodes_for_plot = [i * HYPERPARAMETERS['log_interval_episodes'] for i in range(len(plot_rewards_history))]
        smoothed_rewards = smooth_curve(plot_rewards_history, factor=HYPERPARAMETERS['plot_smoothing_factor'])
        update_live_plot(fig, ax, line, episodes_for_plot, smoothed_rewards,
                         current_timestep=total_timesteps_trained,
                         total_timesteps=HYPERPARAMETERS['total_timesteps'])

    while total_timesteps_trained < HYPERPARAMETERS['total_timesteps']:
        current_rollout_steps = 0

        # Collect one rollout; episodes may span or be cut by the rollout
        # boundary (the step loop below also checks the step budget).
        while current_rollout_steps < HYPERPARAMETERS['num_steps_per_rollout'] and \
              total_timesteps_trained + current_rollout_steps < HYPERPARAMETERS['total_timesteps']:

            state, info = env.reset()
            current_action_mask = info['action_mask']

            done = False
            current_episode_reward = 0

            while not done and current_rollout_steps < HYPERPARAMETERS['num_steps_per_rollout'] and \
                  total_timesteps_trained + current_rollout_steps < HYPERPARAMETERS['total_timesteps']:

                action, log_prob, value = agent.choose_action(state, current_action_mask)

                next_state, reward, terminated, truncated, info = env.step(action)
                current_episode_reward += reward

                next_action_mask = info['action_mask']

                # Store the transition with the mask that was active when
                # the action was chosen (needed to recompute log-probs).
                agent.remember(state, action, reward, next_state, terminated, log_prob, value, current_action_mask)

                state = next_state
                current_action_mask = next_action_mask

                current_rollout_steps += 1

                done = terminated or truncated

                if done:
                    episode_count += 1
                    all_episode_rewards.append(current_episode_reward)

                    if episode_count % HYPERPARAMETERS['log_interval_episodes'] == 0:
                        avg_reward_last_n_episodes = np.mean(all_episode_rewards[-HYPERPARAMETERS['log_interval_episodes']:]).round(2)
                        plot_rewards_history.append(avg_reward_last_n_episodes)

                        elapsed_time = time.time() - start_time
                        print(f"Timestep: {total_timesteps_trained + current_rollout_steps}/{HYPERPARAMETERS['total_timesteps']} | "
                              f"Episode: {episode_count} | "
                              f"Avg Reward (last {HYPERPARAMETERS['log_interval_episodes']}): {avg_reward_last_n_episodes} | "
                              f"Total Score (this ep): {info['score']} | "
                              f"Time: {elapsed_time:.2f}s")

                    if episode_count % HYPERPARAMETERS['live_plot_interval_episodes'] == 0:
                        if len(plot_rewards_history) > 1:
                            episodes_for_plot = [i * HYPERPARAMETERS['log_interval_episodes'] for i in range(len(plot_rewards_history))]
                            smoothed_rewards = smooth_curve(plot_rewards_history, factor=HYPERPARAMETERS['plot_smoothing_factor'])
                            update_live_plot(fig, ax, line, episodes_for_plot, smoothed_rewards,
                                             current_timestep=total_timesteps_trained + current_rollout_steps,
                                             total_timesteps=HYPERPARAMETERS['total_timesteps'])

                    if HYPERPARAMETERS['render_training'] and done:
                        time.sleep(0.5)  # brief pause so the terminal frame is visible

                    break

        total_timesteps_trained += current_rollout_steps

        if len(agent.states) > 0:
            print(f" --- Agent learning at Total Timestep {total_timesteps_trained} (collected {len(agent.states)} steps in rollout) ---")
            agent.learn()
        else:
            print(f" --- No data collected in current rollout, skipping learning ---")

        # Periodic checkpoint: fires once each time the step count crosses a
        # new multiple of save_interval_timesteps.
        if total_timesteps_trained >= HYPERPARAMETERS['save_interval_timesteps'] and \
           (total_timesteps_trained // HYPERPARAMETERS['save_interval_timesteps']) > \
           (last_saved_timesteps // HYPERPARAMETERS['save_interval_timesteps']):

            save_path_timesteps = (total_timesteps_trained // HYPERPARAMETERS['save_interval_timesteps']) * HYPERPARAMETERS['save_interval_timesteps']
            print(f"--- Triggering periodic save at calculated timestep: {save_path_timesteps} ---")
            agent.save_models(os.path.join(MODEL_SAVE_DIR, f"ppo_snake_{save_path_timesteps}"))
            save_training_state(total_timesteps_trained, episode_count, all_episode_rewards, plot_rewards_history)
            last_saved_timesteps = save_path_timesteps

    print("\nTraining finished!")
    print(f"--- Triggering final save at total_timesteps: {total_timesteps_trained} ---")
    agent.save_models(os.path.join(MODEL_SAVE_DIR, "ppo_snake_final"))
    save_training_state(total_timesteps_trained, episode_count, all_episode_rewards, plot_rewards_history)

    env.close()

    print("Generating final performance plot...")
    episodes_for_plot = [i * HYPERPARAMETERS['log_interval_episodes'] for i in range(len(plot_rewards_history))]
    smoothed_rewards = smooth_curve(plot_rewards_history, factor=HYPERPARAMETERS['plot_smoothing_factor'])
    update_live_plot(fig, ax, line, episodes_for_plot, smoothed_rewards,
                     current_timestep=total_timesteps_trained,
                     total_timesteps=HYPERPARAMETERS['total_timesteps'])
    save_live_plot_final(fig, ax)

    plot_rewards(smoothed_rewards, HYPERPARAMETERS['log_interval_episodes'], PLOT_SAVE_DIR, "ppo_training_progress_final.png", show_plot=False)
|
| 229 |
+
|
| 230 |
+
# Script entry point: start (or resume) PPO training when run directly.
if __name__ == "__main__":
    train_agent()
|
Snake_EnvAndAgent.py
ADDED
|
@@ -0,0 +1,312 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gymnasium as gym
|
| 2 |
+
from gymnasium import spaces
|
| 3 |
+
import random
|
| 4 |
+
import pygame
|
| 5 |
+
import numpy as np
|
| 6 |
+
import collections
|
| 7 |
+
from collections import deque
|
| 8 |
+
from Environment_Constants import (
|
| 9 |
+
GRID_SIZE, CELL_SIZE, SCREEN_WIDTH, SCREEN_HEIGHT,
|
| 10 |
+
WHITE, BLACK, GREEN, RED, BLUE,
|
| 11 |
+
UP, DOWN, LEFT, RIGHT,
|
| 12 |
+
REWARD_FOOD, REWARD_COLLISION, REWARD_STEP,
|
| 13 |
+
FPS, OBSERVATION_SPACE_SIZE
|
| 14 |
+
)
|
| 15 |
+
|
| 16 |
+
class SnakeGameEnv(gym.Env):
    """Gymnasium environment for the classic Snake game on a square grid.

    Actions are *relative* to the snake's current heading:
    0 = keep going straight, 1 = turn right, 2 = turn left.

    The observation is a binary float32 vector of length
    OBSERVATION_SPACE_SIZE laid out as:
    [0..2]  danger straight / right / left (next cell is wall or body),
    [3..6]  food direction flags (see ``_get_observation``),
    [7..10] one-hot current heading (UP, DOWN, LEFT, RIGHT).

    UP/DOWN/LEFT/RIGHT are (dx, dy) unit vectors imported from
    Environment_Constants (not shown here).  The initial body is laid out at
    (hx, hy+1), (hx, hy+2) while heading UP, so UP presumably decreases y
    (screen coordinates) -- TODO confirm against Environment_Constants.
    """

    metadata = {'render_modes': ['human', 'rgb_array'], 'render_fps': FPS}

    def __init__(self, render_mode=None):
        """Create the environment.

        Args:
            render_mode: None, 'human' (pygame window, throttled to
                ``render_fps``) or 'rgb_array' (frame returned as ndarray).
        """
        super().__init__()
        self.grid_size = GRID_SIZE
        self.cell_size = CELL_SIZE
        self.screen_width = SCREEN_WIDTH
        self.screen_height = SCREEN_HEIGHT

        # Three relative actions: straight / turn right / turn left.
        self.action_space = spaces.Discrete(3)

        self.observation_space = spaces.Box(low=0, high=1,
                                            shape=(OBSERVATION_SPACE_SIZE,),
                                            dtype=np.float32)

        self.render_mode = render_mode
        # Lazily created by _render_frame on first human-mode render.
        self.window = None
        self.clock = None

        self._init_game_state()

    def _init_game_state(self):
        """(Re)initialize snake, food, score and bookkeeping for a new episode."""
        # Deque of (x, y) cells; index 0 is the head, the last entry is the tail.
        self.snake = deque()
        self.head = (self.grid_size // 2, self.grid_size // 2)
        self.snake.append(self.head)
        # Two body segments behind the head (in +y), matching the initial UP heading.
        self.snake.append((self.head[0], self.head[1] + 1))
        self.snake.append((self.head[0], self.head[1] + 2))

        self.direction = UP
        self.score = 0
        self.food = self._place_food()
        self.game_over = False
        # Step counter used for the anti-stalling (starvation) cutoff in step().
        self.steps_since_food = 0
        self.length = len(self.snake)


    def _place_food(self):
        """Return a uniformly random empty cell for the food.

        NOTE(review): uses the module-level ``random``, not ``self.np_random``,
        so the ``seed`` passed to reset() does not control food placement --
        confirm whether reproducible episodes are required.
        """
        while True:
            x = random.randrange(self.grid_size)
            y = random.randrange(self.grid_size)
            food_pos = (x, y)
            if food_pos not in self.snake:
                return food_pos

    def _is_position_safe_for_observation(self, pos):
        """True if ``pos`` is inside the grid and not on the snake body.

        The head (index 0) is excluded from the body check; unlike the action
        mask below, the tail is treated as unsafe even though it may move away.
        """
        px, py = pos
        if not (0 <= px < self.grid_size and 0 <= py < self.grid_size):
            return False
        if pos in list(self.snake)[1:]:
            return False
        return True

    def _get_observation(self):
        """Build the 11-flag binary observation vector (see class docstring)."""
        obs = np.zeros(OBSERVATION_SPACE_SIZE, dtype=np.float32)

        hx, hy = self.head

        # Resolve the absolute directions corresponding to the relative
        # "straight / right / left" moves for the current heading.
        if self.direction == UP:
            dir_straight = UP
            dir_right = RIGHT
            dir_left = LEFT
        elif self.direction == DOWN:
            dir_straight = DOWN
            dir_right = LEFT
            dir_left = RIGHT
        elif self.direction == LEFT:
            dir_straight = LEFT
            dir_right = UP
            dir_left = DOWN
        elif self.direction == RIGHT:
            dir_straight = RIGHT
            dir_right = DOWN
            dir_left = UP

        check_pos_straight = (hx + dir_straight[0], hy + dir_straight[1])
        check_pos_right = (hx + dir_right[0], hy + dir_right[1])
        check_pos_left = (hx + dir_left[0], hy + dir_left[1])

        # Danger flags: 1 when the adjacent cell is NOT safe.
        obs[0] = 1 if not self._is_position_safe_for_observation(check_pos_straight) else 0
        obs[1] = 1 if not self._is_position_safe_for_observation(check_pos_right) else 0
        obs[2] = 1 if not self._is_position_safe_for_observation(check_pos_left) else 0

        # Food direction flags relative to the head (axis-aligned comparisons;
        # obs[3]/obs[4] are smaller-y / larger-y, obs[5]/obs[6] smaller-x / larger-x).
        fx, fy = self.food
        if fy < hy: obs[3] = 1
        if fy > hy: obs[4] = 1
        if fx < hx: obs[5] = 1
        if fx > hx: obs[6] = 1

        # One-hot encoding of the current heading.
        if self.direction == UP: obs[7] = 1
        elif self.direction == DOWN: obs[8] = 1
        elif self.direction == LEFT: obs[9] = 1
        elif self.direction == RIGHT: obs[10] = 1

        return obs

    def _get_action_mask(self):
        """Return a bool[3] mask of legal relative actions [straight, right, left].

        A move is illegal when it leaves the grid or hits the body; moving into
        the tail cell is also treated as illegal unless the tail cell holds the
        food (same rule as the collision checks in step()).
        """
        mask = np.array([True, True, True], dtype=bool)
        hx, hy = self.head

        # potential_directions[i] is the absolute direction taken by action i.
        potential_directions = [
            self.direction,
            None,
            None
        ]

        if self.direction == UP:
            potential_directions[1] = RIGHT
            potential_directions[2] = LEFT
        elif self.direction == DOWN:
            potential_directions[1] = LEFT
            potential_directions[2] = RIGHT
        elif self.direction == LEFT:
            potential_directions[1] = UP
            potential_directions[2] = DOWN
        elif self.direction == RIGHT:
            potential_directions[1] = DOWN
            potential_directions[2] = UP

        def _is_potential_move_illegal(pos_to_check, current_snake, food_pos):
            # Out of bounds.
            if not (0 <= pos_to_check[0] < self.grid_size and 0 <= pos_to_check[1] < self.grid_size):
                return True

            # Collides with any segment except the tail (the tail normally moves).
            if pos_to_check in list(current_snake)[:-1]:
                return True

            # Tail cell: only legal if the food is there (food is never on the
            # snake, so in practice this branch marks tail moves illegal).
            if pos_to_check == current_snake[-1]:
                if pos_to_check != food_pos:
                    return True


            return False

        for action_idx, new_dir in enumerate(potential_directions):
            dx, dy = new_dir
            potential_head = (hx + dx, hy + dy)
            if _is_potential_move_illegal(potential_head, self.snake, self.food):
                mask[action_idx] = False

        # Fallback: never hand the agent an all-False mask, even in a dead end.
        if not np.any(mask):
            print(f"Warning: All actions masked out at head {self.head}, direction {self.direction}, food {self.food}. Attempting to find a fallback action.")
            found_fallback = False
            for i in range(3): # Check Straight, Right, Left
                dx, dy = potential_directions[i]
                potential_head = (hx + dx, hy + dy)
                if not _is_potential_move_illegal(potential_head, self.snake, self.food):
                    mask[i] = True
                    found_fallback = True

            if not found_fallback:
                # Truly boxed in: enable a random action so the agent can still act
                # (the resulting move will terminate the episode via step()).
                mask[np.random.choice(3)] = True
                print("Critical Warning: No legal actions found even after fallback logic. Enabling a random action to prevent deadlock.")

        return mask

    def reset(self, seed=None, options=None):
        """Start a new episode; returns (observation, info).

        NOTE(review): ``seed`` seeds gym's ``self.np_random`` via super(),
        but food placement uses the module-level ``random`` (see _place_food),
        so episodes are not reproducible from this seed alone.
        """
        super().reset(seed=seed)
        self._init_game_state()
        observation = self._get_observation()
        info = self._get_info()

        if not np.any(info['action_mask']):
            print("Warning: No valid actions found in initial reset state.")

        if self.render_mode == 'human':
            self._render_frame()
        return observation, info

    def _get_info(self):
        """Returns environment information, including the action mask."""
        return {
            "score": self.score,
            "snake_length": len(self.snake),
            "action_mask": self._get_action_mask()
        }

    def step(self, action):
        """Advance one tick; returns (obs, reward, terminated, truncated, info)."""
        # Translate the relative action into an absolute heading.
        new_direction = self.direction

        if action == 1:
            if self.direction == UP: new_direction = RIGHT
            elif self.direction == DOWN: new_direction = LEFT
            elif self.direction == LEFT: new_direction = UP
            elif self.direction == RIGHT: new_direction = DOWN
        elif action == 2:
            if self.direction == UP: new_direction = LEFT
            elif self.direction == DOWN: new_direction = RIGHT
            elif self.direction == LEFT: new_direction = DOWN
            elif self.direction == RIGHT: new_direction = UP
        elif action != 0:
            raise ValueError(f"Received invalid action={action} which is not part of the action space.")

        self.direction = new_direction

        hx, hy = self.head
        dx, dy = self.direction
        new_head = (hx + dx, hy + dy)

        reward = REWARD_STEP
        terminated = False
        truncated = False
        # Wall collision.
        if not (0 <= new_head[0] < self.grid_size and 0 <= new_head[1] < self.grid_size):
            terminated = True
            reward = REWARD_COLLISION

        # Body collision (tail excluded; handled separately below).
        elif new_head in list(self.snake)[:-1]:
            terminated = True
            reward = REWARD_COLLISION
        # Moving onto the tail also ends the episode unless the food sits there
        # (consistent with the action-mask rule in _get_action_mask).
        elif new_head == self.snake[-1] and new_head != self.food:
            terminated = True
            reward = REWARD_COLLISION

        if terminated:
            self.game_over = True
        else:
            # Advance: grow at the head; the tail is popped below unless we ate.
            self.snake.appendleft(new_head)
            self.head = new_head

            if new_head == self.food:
                self.score += 1
                self.length += 1
                reward = REWARD_FOOD
                self.food = self._place_food()
                self.steps_since_food = 0
            else:
                self.snake.pop()
                self.steps_since_food += 1

            # Anti-stalling cutoff: end the episode after ~1.5 grid-areas of
            # steps without eating.
            # NOTE(review): both flags are set here; Gymnasium convention is
            # that a time-limit cutoff sets only ``truncated`` -- confirm the
            # trainer's GAE/bootstrapping expects this.
            if self.steps_since_food >= self.grid_size * self.grid_size * 1.5:
                terminated = True
                truncated = True
                reward = REWARD_COLLISION


        observation = self._get_observation()
        info = self._get_info()

        if self.render_mode == 'human':
            self._render_frame()

        return observation, reward, terminated, truncated, info

    def _render_frame(self):
        """Draw the current state; returns an RGB ndarray in 'rgb_array' mode."""
        # Lazy pygame/window initialization on first human-mode render.
        if self.window is None and self.render_mode == 'human':
            pygame.init()
            pygame.display.init()
            self.window = pygame.display.set_mode((self.screen_width, self.screen_height))
            pygame.display.set_caption("Snake AI Training")
        if self.clock is None and self.render_mode == 'human':
            self.clock = pygame.time.Clock()

        if self.render_mode == 'human':
            self.window.fill(BLACK)

            # Food cell.
            pygame.draw.rect(self.window, RED, (self.food[0] * self.cell_size,
                                                self.food[1] * self.cell_size,
                                                self.cell_size, self.cell_size))

            # Snake: head in BLUE, body in GREEN.
            for i, segment in enumerate(self.snake):
                color = BLUE if i == 0 else GREEN
                pygame.draw.rect(self.window, color, (segment[0] * self.cell_size,
                                                      segment[1] * self.cell_size,
                                                      self.cell_size, self.cell_size))

            # Grid lines.
            for x in range(0, self.screen_width, self.cell_size):
                pygame.draw.line(self.window, WHITE, (x, 0), (x, self.screen_height))
            for y in range(0, self.screen_height, self.cell_size):
                pygame.draw.line(self.window, WHITE, (0, y), (self.screen_width, y))

            font = pygame.font.Font(None, 25)
            text = font.render(f"Score: {self.score}", True, WHITE)
            self.window.blit(text, (5, 5))

            pygame.event.pump()
            pygame.display.flip()
            # Throttle to the configured frame rate.
            self.clock.tick(self.metadata["render_fps"])
        elif self.render_mode == "rgb_array":
            # Off-screen render (no grid lines / score text in this mode).
            surf = pygame.Surface((self.screen_width, self.screen_height))
            surf.fill(BLACK)
            pygame.draw.rect(surf, RED, (self.food[0] * self.cell_size,
                                         self.food[1] * self.cell_size,
                                         self.cell_size, self.cell_size))
            for i, segment in enumerate(self.snake):
                color = BLUE if i == 0 else GREEN
                pygame.draw.rect(surf, color, (segment[0] * self.cell_size,
                                               segment[1] * self.cell_size,
                                               self.cell_size, self.cell_size))
            # pygame surfaces are (width, height); transpose to (H, W, 3).
            return np.transpose(np.array(pygame.surfarray.pixels3d(surf)), axes=(1, 0, 2))

    def close(self):
        """Shut down the pygame window and release display resources."""
        if self.window is not None:
            pygame.display.quit()
            pygame.quit()
            self.window = None
            self.clock = None
|
Trained_PPO_Agent.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gymnasium as gym
|
| 2 |
+
from Snake_EnvAndAgent import SnakeGameEnv
|
| 3 |
+
from PPO_Model import PPOAgent
|
| 4 |
+
import os
|
| 5 |
+
import time
|
| 6 |
+
import numpy as np
|
| 7 |
+
from plot_utility_Trained_Agent import init_live_plot, update_live_plot, save_live_plot_final, smooth_curve
|
| 8 |
+
|
| 9 |
+
# Playback configuration for the trained agent.
# NOTE(review): 'grid_size' is defined here but never read in this script --
# the environment takes its size from Environment_Constants; confirm which
# value is authoritative.
PLAY_CONFIG = {
    'grid_size': 30,
    'model_path_prefix': 'snake_ppo_models/ppo_snake_final',  # prefix passed to PPOAgent.load_models
    'num_episodes_to_play': 100,
    'render_fps': 10,  # when > 0, overrides env.metadata["render_fps"]
    'live_plot_interval_episodes': 1,  # refresh the live plot every N episodes
    'plot_smoothing_factor': 0.8  # exponential smoothing factor for the reward curve
}

# Directory for playback plots; created eagerly at import time.
PLOT_SAVE_DIR = 'snake_ppo_plots'
os.makedirs(PLOT_SAVE_DIR, exist_ok=True)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def play_agent():
    """Load the trained PPO actor/critic and play Snake with live rendering.

    Runs ``PLAY_CONFIG['num_episodes_to_play']`` greedy-playback episodes,
    printing per-episode reward/score and updating a live matplotlib plot of
    (smoothed) episode rewards.  Returns early if the model files cannot be
    loaded.  No return value; side effects are the pygame window, stdout
    logging and the saved plot file.
    """
    print("Initializing environment for playback...")
    env = SnakeGameEnv(render_mode='human')

    # Override the environment's frame rate for human viewing.
    if PLAY_CONFIG['render_fps'] > 0:
        env.metadata["render_fps"] = PLAY_CONFIG['render_fps']

    obs_shape = env.observation_space.shape
    action_size = env.action_space.n

    # Hyperparameters here only shape the networks so the saved weights fit;
    # the learning rates are unused during playback.
    # NOTE(review): hidden_layer_sizes must match the architecture used in
    # training or load_models will fail -- confirm against PPO_Trainer.
    agent = PPOAgent(
        observation_space_shape=obs_shape,
        action_space_size=action_size,
        actor_lr=3e-4,
        critic_lr=3e-4,
        hidden_layer_sizes=[512, 512, 512]
    )

    print(f"loading models from: {PLAY_CONFIG['model_path_prefix']}")

    load_success = agent.load_models(PLAY_CONFIG['model_path_prefix'])

    if not load_success:
        print("\nFATAL ERROR: Failed to load trained models from disk. The agent CANNOT perform as trained. Exiting playback!.")
        env.close()
        return

    print("--- Trained models loaded successfully. Loading playback. ---")

    print("Starting agent playback...")

    episode_rewards_playback = []

    # Live reward plot; the title/labels set in init_live_plot are overridden
    # below for playback-specific wording.
    fig, ax, line = init_live_plot(PLOT_SAVE_DIR, filename="live_playback_rewards_plot.png")
    ax.set_title('Live Playback Progress (Episode Rewards)')
    ax.set_xlabel('Episode')
    ax.set_ylabel('Total Reward')

    for episode in range(PLAY_CONFIG['num_episodes_to_play']):
        state, info = env.reset()
        current_action_mask = info['action_mask']

        done = False
        episode_reward = 0
        steps = 0

        while not done:
            # Only the chosen action is needed for playback; log-prob and
            # value estimates are discarded.
            action, _, _ = agent.choose_action(state, current_action_mask)

            next_state, reward, terminated, truncated, info = env.step(action)
            episode_reward += reward
            state = next_state
            steps += 1
            done = terminated or truncated

            current_action_mask = info['action_mask']

            # Slow the loop to roughly real time.
            # NOTE(review): env.step already throttles via clock.tick in human
            # mode, so this sleep halves the effective frame rate -- confirm
            # the double throttle is intended.
            if PLAY_CONFIG['render_fps'] > 0:
                time.sleep(1 / env.metadata["render_fps"])

        episode_rewards_playback.append(episode_reward)

        print(f"Episode {episode + 1}: Total Reward = {episode_reward:.2f}, Score = {info['score']}, Steps = {steps}")

        # Periodically refresh the live plot with the smoothed reward curve.
        if (episode + 1) % PLAY_CONFIG['live_plot_interval_episodes'] == 0:
            current_episodes = list(range(1, len(episode_rewards_playback) + 1))
            smoothed_rewards = smooth_curve(episode_rewards_playback, factor=PLAY_CONFIG['plot_smoothing_factor'])
            update_live_plot(fig, ax, line, current_episodes, smoothed_rewards,
                             current_timestep=episode + 1,
                             total_timesteps=PLAY_CONFIG['num_episodes_to_play'])

        # Brief pause between episodes so the final frame stays visible.
        time.sleep(0.5)

    env.close()
    print("\nPlayback finished.")

    # Final plot refresh and save-to-disk.
    current_episodes = list(range(1, len(episode_rewards_playback) + 1))
    smoothed_rewards = smooth_curve(episode_rewards_playback, factor=PLAY_CONFIG['plot_smoothing_factor'])
    update_live_plot(fig, ax, line, current_episodes, smoothed_rewards,
                     current_timestep=PLAY_CONFIG['num_episodes_to_play'],
                     total_timesteps=PLAY_CONFIG['num_episodes_to_play'])
    save_live_plot_final(fig, ax)

    avg_playback_reward = np.mean(episode_rewards_playback)
    print(f"Average Reward over {PLAY_CONFIG['num_episodes_to_play']} playback episodes: {avg_playback_reward:.2f}")
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
# Script entry point: run the playback loop only when executed directly.
if __name__ == "__main__":
    play_agent()
|
plot_utility_Trained_Agent.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import matplotlib.pyplot as plt
|
| 2 |
+
import numpy as np
|
| 3 |
+
import os
|
| 4 |
+
import time
|
| 5 |
+
|
| 6 |
+
def smooth_curve(points, factor=0.9):
|
| 7 |
+
smoothed_points = []
|
| 8 |
+
if points:
|
| 9 |
+
smoothed_points.append(points[0])
|
| 10 |
+
for i in range(1, len(points)):
|
| 11 |
+
smoothed_points.append(smoothed_points[-1] * factor + points[i] * (1 - factor))
|
| 12 |
+
return smoothed_points
|
| 13 |
+
|
| 14 |
+
def plot_rewards(rewards_history, log_interval, save_dir, filename="Trained_Agent_rewards_plot.png", show_plot=True):
    """Save (and optionally display) a static average-reward-vs-episode plot.

    Args:
        rewards_history: one averaged reward per logging interval.
        log_interval: number of episodes per logged point (x-axis scale).
        save_dir: directory for the PNG; created if missing.
        filename: output file name inside ``save_dir``.
        show_plot: when True, also open an interactive window.
    """
    os.makedirs(save_dir, exist_ok=True)

    # x-axis: cumulative episode count at each logged point.
    x_values = [idx * log_interval for idx in range(1, len(rewards_history) + 1)]

    plt.figure(figsize=(12, 6))
    plt.plot(x_values, rewards_history, label='Average Reward')
    plt.xlabel('Episodes')
    plt.ylabel('Average Reward')
    plt.title('Trained Agent Live Reward (Average Reward per Episode)')
    plt.grid(True)
    plt.legend()
    plt.tight_layout()

    save_path = os.path.join(save_dir, filename)
    plt.savefig(save_path)
    print(f"Plot saved to: {os.path.abspath(save_path)}")

    if show_plot:
        plt.show()
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def init_live_plot(save_dir, filename="live_rewards_plot.png"):
    """Create an interactive reward figure and return ``(fig, ax, line)``.

    The line starts empty; callers feed it data via ``update_live_plot``.
    The final save destination is stashed on the axes so that
    ``save_live_plot_final`` can retrieve it later.
    """
    plt.ion()  # interactive mode so subsequent draws do not block
    fig, ax = plt.subplots(figsize=(12, 6))
    (line,) = ax.plot([], [], label='Smoothed Average Reward')

    ax.set_xlabel('Episodes')
    ax.set_ylabel('Average Reward')
    ax.set_title('Live Reward for Trained Agent')
    ax.grid(True)
    ax.legend()
    plt.tight_layout()

    # Remember where the finished plot should be written.
    ax._save_path_final = os.path.join(save_dir, filename)

    return fig, ax, line
|
| 51 |
+
|
| 52 |
+
def update_live_plot(fig, ax, line, episodes, smoothed_rewards, current_timestep=None, total_timesteps=None):
    """Refresh the live reward plot with the latest smoothed data.

    Args:
        fig, ax, line: objects returned by ``init_live_plot``.
        episodes: x values (episode numbers); no-op when empty.
        smoothed_rewards: y values, same length as ``episodes``.
        current_timestep, total_timesteps: when both given, shown in the title.
    """
    if not episodes or not smoothed_rewards:
        return

    line.set_data(episodes, smoothed_rewards)

    ax.set_xlim(0, max(episodes) * 1.05)

    # Pad the y-range by 5% of the data span (minimum 0.05 so a flat curve
    # still has visible headroom).  The previous code scaled min/max by
    # 0.9/1.1, which clips the curve whenever rewards are negative
    # (e.g. a minimum of -10 became -9, hiding the lowest points).
    lo = min(smoothed_rewards)
    hi = max(smoothed_rewards)
    pad = max((hi - lo) * 0.05, 0.05)
    ax.set_ylim(lo - pad, hi + pad)

    if current_timestep is not None and total_timesteps is not None:
        ax.set_title(f'Live Agent Progress (Timestep: {current_timestep:,}/{total_timesteps:,})')

    # Push the changes to the screen without blocking the playback loop.
    fig.canvas.draw()
    fig.canvas.flush_events()
    time.sleep(0.01)
|
| 75 |
+
|
| 76 |
+
def save_live_plot_final(fig, ax):
    """Save the live plot to its recorded path and close the figure.

    The destination path was stashed on ``ax._save_path_final`` by
    ``init_live_plot``; nothing is written if that attribute is absent.
    """
    plt.ioff()
    save_path = getattr(ax, '_save_path_final', None)
    if save_path:
        plt.savefig(save_path)
        print(f"Final live plot saved to: {os.path.abspath(save_path)}")
    plt.close(fig)
    # Kept from the original flow; with ``fig`` already closed this shows any
    # remaining figures (presumably a no-op when none exist -- confirm).
    plt.show()
|
plot_utility_Trainer.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import matplotlib.pyplot as plt
|
| 2 |
+
import numpy as np
|
| 3 |
+
import os
|
| 4 |
+
import time
|
| 5 |
+
|
| 6 |
+
def smooth_curve(points, factor=0.9):
|
| 7 |
+
|
| 8 |
+
smoothed_points = []
|
| 9 |
+
if points:
|
| 10 |
+
smoothed_points.append(points[0])
|
| 11 |
+
for i in range(1, len(points)):
|
| 12 |
+
smoothed_points.append(smoothed_points[-1] * factor + points[i] * (1 - factor))
|
| 13 |
+
return smoothed_points
|
| 14 |
+
|
| 15 |
+
def plot_rewards(rewards_history, log_interval, save_dir, filename="rewards_plot.png", show_plot=True):
    """Save (and optionally display) a static training-reward plot.

    Args:
        rewards_history: one averaged reward per logging interval.
        log_interval: episodes per logged point; scales the x-axis.
        save_dir: output directory, created if missing.
        filename: PNG file name inside ``save_dir``.
        show_plot: when True, also open an interactive window (blocking).
    """
    os.makedirs(save_dir, exist_ok=True)

    plt.figure(figsize=(12, 6))
    # x-axis: cumulative episode count at each logged point.
    episodes = [i * log_interval for i in range(1, len(rewards_history) + 1)]
    plt.plot(episodes, rewards_history, label='Average Reward')
    plt.xlabel('Episodes')
    plt.ylabel('Average Reward')
    plt.title('PPO Training Progress (Average Reward per Episode)')
    plt.grid(True)
    plt.legend()
    plt.tight_layout()

    save_path = os.path.join(save_dir, filename)
    plt.savefig(save_path)
    print(f"Plot saved to: {os.path.abspath(save_path)}")

    if show_plot:
        plt.show()
|
| 35 |
+
|
| 36 |
+
def init_live_plot(save_dir, filename="live_rewards_plot.png"):
    """Create an interactive training-progress figure.

    Returns:
        (fig, ax, line): the figure, its axes, and an initially-empty line
        that ``update_live_plot`` fills in during training.
    """
    plt.ion()  # interactive mode: later draws do not block the training loop
    fig, ax = plt.subplots(figsize=(12, 6))
    line, = ax.plot([], [], label='Smoothed Average Reward')
    ax.set_xlabel('Episodes')
    ax.set_ylabel('Average Reward')
    ax.set_title('Live PPO Training Progress')
    ax.grid(True)
    ax.legend()
    plt.tight_layout()

    # Stash the final save destination on the axes so save_live_plot_final
    # can retrieve it without extra plumbing.
    ax._save_path_final = os.path.join(save_dir, filename)

    return fig, ax, line
|
| 51 |
+
|
| 52 |
+
def update_live_plot(fig, ax, line, episodes, smoothed_rewards, current_timestep=None, total_timesteps=None):
    """Refresh the live training plot with the latest smoothed rewards.

    Args:
        fig, ax, line: objects returned by ``init_live_plot``.
        episodes: x values (episode numbers); no-op when empty.
        smoothed_rewards: y values, same length as ``episodes``.
        current_timestep, total_timesteps: when both given, shown in the title.
    """
    if not episodes or not smoothed_rewards:
        return

    line.set_data(episodes, smoothed_rewards)

    ax.set_xlim(0, max(episodes) * 1.05)

    # Pad the y-range by 5% of the data span (minimum 0.05 so a flat curve
    # still has visible headroom).  The previous code scaled min/max by
    # 0.9/1.1, which clips the curve whenever rewards are negative
    # (e.g. a minimum of -10 became -9, hiding the lowest points).
    lo = min(smoothed_rewards)
    hi = max(smoothed_rewards)
    pad = max((hi - lo) * 0.05, 0.05)
    ax.set_ylim(lo - pad, hi + pad)

    if current_timestep is not None and total_timesteps is not None:
        ax.set_title(f'Live PPO Training Progress (Timestep: {current_timestep:,}/{total_timesteps:,})')

    # Push the changes to the screen without blocking the training loop.
    fig.canvas.draw()
    fig.canvas.flush_events()
    time.sleep(0.01)
|
| 75 |
+
|
| 76 |
+
def save_live_plot_final(fig, ax):
    """Save the live training plot to its recorded path and close the figure.

    The destination path was stored on ``ax._save_path_final`` by
    ``init_live_plot``; nothing is written if that attribute is missing.
    """
    plt.ioff()
    save_path = getattr(ax, '_save_path_final', None)
    if save_path:
        plt.savefig(save_path)
        print(f"Final live plot saved to: {os.path.abspath(save_path)}")
    plt.close(fig)
    # NOTE(review): show() is called after the figure is closed; presumably a
    # no-op when no other figures remain open -- confirm this is intended.
    plt.show()
|