import tensorflow as tf
from keras.layers import Dense, Normalization, Input
from keras.models import Model
import tensorflow_probability as tfp
import numpy as np
import os

from config import *


class RunningMeanStd:
    """Running mean/variance tracker for observation normalization.

    Uses the parallel-moments merge (Chan et al.) so batches can be folded in
    incrementally. `count` starts at a small epsilon so the very first update
    is numerically stable.
    """

    def __init__(self, shape):
        # Per-feature running statistics, float32 to match the networks.
        self.mean = np.zeros(shape, dtype=np.float32)
        self.var = np.ones(shape, dtype=np.float32)
        self.count = 1e-4

    def update(self, x):
        """Fold a batch `x` (batch along axis 0) into the running moments."""
        batch_mean = np.mean(x, axis=0)
        batch_var = np.var(x, axis=0)
        batch_count = x.shape[0]
        self.update_from_moments(batch_mean, batch_var, batch_count)

    def update_from_moments(self, batch_mean, batch_var, batch_count):
        """Merge externally computed batch moments into the running moments."""
        delta = batch_mean - self.mean
        total_count = self.count + batch_count
        new_mean = self.mean + delta * batch_count / total_count
        # Parallel variance combination: m2 accumulates both groups' second
        # moments plus the between-group correction term.
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m2 = m_a + m_b + np.square(delta) * self.count * batch_count / total_count
        new_var = m2 / total_count
        self.mean = new_mean
        self.var = new_var
        self.count = total_count


class PPOAgent:
    """PPO agent with separate policy/value MLPs, clipped objectives, LR decay,
    observation normalization, and full checkpointing (including normalizer state).
    """

    def __init__(self, obs_shape, action_size, total_timesteps):
        self.obs_shape = obs_shape
        self.action_size = action_size
        self.policy = self._build_policy_model(obs_shape, action_size)
        self.value = self._build_value_model(obs_shape)
        # Linear decay to 0 over the expected number of optimizer steps.
        # NOTE(review): decay_steps assumes one optimizer step per epoch per
        # rollout; with NUM_MINIBATCHES > 1 the schedule finishes early —
        # confirm against the training loop.
        self.lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
            initial_learning_rate=LEARNING_RATE,
            decay_steps=total_timesteps * PPO_EPOCHS / (N_STEPS * NUM_ENVS),
            end_learning_rate=0.0
        )
        self.optimizer = tf.keras.optimizers.Adam(
            learning_rate=self.lr_schedule, epsilon=1e-5
        )
        # Normalizer state is mirrored into tf.Variables so it rides along in
        # the checkpoint (tf.train.Checkpoint cannot track plain NumPy arrays).
        self.obs_rms = RunningMeanStd(shape=obs_shape)
        self.rms_mean_var = tf.Variable(
            self.obs_rms.mean, dtype=tf.float32, name="obs_rms_mean"
        )
        self.rms_var_var = tf.Variable(
            self.obs_rms.var, dtype=tf.float32, name="obs_rms_var"
        )
        self.rms_count_var = tf.Variable(
            self.obs_rms.count, dtype=tf.float32, name="obs_rms_count"
        )
        self.checkpoint = tf.train.Checkpoint(
            policy=self.policy,
            value=self.value,
            optimizer=self.optimizer,
            obs_rms_mean=self.rms_mean_var,
            obs_rms_var=self.rms_var_var,
            obs_rms_count=self.rms_count_var
        )
        self.checkpoint_manager = tf.train.CheckpointManager(
            self.checkpoint,
            os.path.join(SAVE_PATH, 'tf_checkpoints'),
            max_to_keep=1000
        )

    def _build_policy_model(self, obs_shape, action_size):
        """Two-layer ReLU MLP producing unnormalized action logits."""
        inputs = tf.keras.Input(shape=obs_shape)
        x = Dense(64, activation='relu')(inputs)
        x = Dense(64, activation='relu')(x)
        logits = Dense(action_size)(x)
        return tf.keras.Model(inputs=inputs, outputs=logits, name="policy_model")

    def _build_value_model(self, obs_shape):
        """Two-layer ReLU MLP producing a scalar state-value estimate."""
        inputs = tf.keras.Input(shape=obs_shape)
        x = Dense(64, activation='relu')(inputs)
        x = Dense(64, activation='relu')(x)
        value = Dense(1)(x)
        return tf.keras.Model(inputs=inputs, outputs=value, name="value_model")

    def adapt_normalization(self, initial_observations):
        """Updates the observation normalizer using initial data and saves state to checkpoint variables."""
        self.obs_rms.update(initial_observations)
        # Update checkpoint variables for persistence
        self.rms_mean_var.assign(self.obs_rms.mean)
        self.rms_var_var.assign(self.obs_rms.var)
        self.rms_count_var.assign(self.obs_rms.count)

    def normalize_obs(self, obs):
        """
        Applies normalization. This runs in Eager mode (NumPy input) or
        Graph mode (Tensor input). The source of RMS parameters is selected
        based on the input type: tensors read the tf.Variables (graph-safe),
        NumPy arrays read the plain NumPy state.
        """
        is_tensor = tf.is_tensor(obs)
        rms_mean = self.rms_mean_var if is_tensor else self.obs_rms.mean
        rms_var = self.rms_var_var if is_tensor else self.obs_rms.var
        if not is_tensor:
            obs = obs.astype(np.float32)
        normalized_obs = (obs - rms_mean) / tf.sqrt(rms_var + 1e-8)
        # Clip to +/-10 standard deviations to bound outliers.
        normalized_obs = tf.clip_by_value(normalized_obs, -10.0, 10.0)
        if not is_tensor:
            return normalized_obs.numpy()
        return normalized_obs

    def select_action(self, obs):
        """Selects action, computes value, and log_prob for the given observation in Eager Mode.

        Returns (actions, values, log_probs) as NumPy arrays; values are
        flattened to shape (batch,).
        """
        obs_tensor = tf.convert_to_tensor(obs, dtype=tf.float32)
        normalized_obs = self.normalize_obs(obs_tensor)
        logits = self.policy(normalized_obs, training=False)
        values = self.value(normalized_obs, training=False)
        distribution = tfp.distributions.Categorical(logits=logits)
        actions = distribution.sample()
        log_probs = distribution.log_prob(actions)
        actions_np = actions.numpy().astype(np.int64)
        values_np = values.numpy().flatten()
        log_probs_np = log_probs.numpy()
        return actions_np, values_np, log_probs_np

    @tf.function
    def learn_step(self, obs, actions, old_log_probs, returns, advantages, old_values):
        """Performs a single PPO optimization step on one minibatch.

        All of `old_log_probs`, `returns`, `advantages`, `old_values` are
        rank-1 tensors of shape (batch,).
        Returns (-policy_loss, value_loss, entropy) for logging.
        """
        with tf.GradientTape() as tape:
            logits = self.policy(obs, training=True)
            # BUG FIX: the value head outputs shape (batch, 1); subtracting the
            # rank-1 (batch,) `returns`/`old_values` would broadcast to a
            # (batch, batch) matrix and corrupt the loss. Squeeze to (batch,).
            values = tf.squeeze(self.value(obs, training=True), axis=-1)
            distribution = tfp.distributions.Categorical(logits=logits)
            log_probs = distribution.log_prob(actions)
            ratio = tf.exp(log_probs - old_log_probs)

            # Clipped value loss (PPO2-style): limit how far the new value
            # prediction may move from the rollout-time prediction.
            values_clipped = old_values + tf.clip_by_value(
                values - old_values, -CLIP_RANGE, CLIP_RANGE
            )
            value_loss1 = tf.square(returns - values)
            value_loss2 = tf.square(returns - values_clipped)
            value_loss = 0.5 * tf.reduce_mean(tf.maximum(value_loss1, value_loss2))

            # Clipped surrogate policy objective.
            pg_loss1 = -advantages * ratio
            pg_loss2 = -advantages * tf.clip_by_value(
                ratio, 1.0 - CLIP_RANGE, 1.0 + CLIP_RANGE
            )
            policy_loss = tf.reduce_mean(tf.maximum(pg_loss1, pg_loss2))

            entropy = tf.reduce_mean(distribution.entropy())
            total_loss = policy_loss + VALUE_COEF * value_loss - ENTROPY_COEF * entropy

        trainable = self.policy.trainable_variables + self.value.trainable_variables
        grads = tape.gradient(total_loss, trainable)
        grads, _ = tf.clip_by_global_norm(grads, MAX_GRAD_NORM)
        self.optimizer.apply_gradients(zip(grads, trainable))
        return -policy_loss, value_loss, entropy

    def learn(self, ppo_batch):
        """PPO update loop with data preparation and mini-batching.

        Updates the observation normalizer from the rollout, normalizes
        advantages, then runs PPO_EPOCHS passes of shuffled minibatch updates.
        Returns mean (policy_loss, value_loss, entropy) across all updates.
        """
        # Fold the new rollout into the normalizer and mirror it into the
        # checkpointed variables.
        self.obs_rms.update(ppo_batch['observations'])
        self.rms_mean_var.assign(self.obs_rms.mean)
        self.rms_var_var.assign(self.obs_rms.var)
        self.rms_count_var.assign(self.obs_rms.count)

        obs = self.normalize_obs(ppo_batch['observations'])
        actions = ppo_batch['actions']
        old_log_probs = ppo_batch['log_probs']
        returns = ppo_batch['returns']
        advantages = ppo_batch['advantages']
        old_values = ppo_batch['old_values']

        # Standardize advantages across the whole batch.
        advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)

        obs_tensor = tf.convert_to_tensor(obs, dtype=tf.float32)
        actions_tensor = tf.convert_to_tensor(actions, dtype=tf.int64)
        old_log_probs_tensor = tf.convert_to_tensor(old_log_probs, dtype=tf.float32)
        # Flatten to rank-1 so learn_step's arithmetic stays elementwise.
        returns_tensor = tf.convert_to_tensor(returns.flatten(), dtype=tf.float32)
        advantages_tensor = tf.convert_to_tensor(advantages.flatten(), dtype=tf.float32)
        old_values_tensor = tf.convert_to_tensor(old_values.flatten(), dtype=tf.float32)

        batch_size = obs_tensor.shape[0]
        minibatch_size = batch_size // NUM_MINIBATCHES
        policy_losses, value_losses, entropies = [], [], []

        for epoch in range(PPO_EPOCHS):
            indices = tf.range(batch_size)
            shuffled_indices = tf.random.shuffle(indices)
            for start in range(0, batch_size, minibatch_size):
                end = start + minibatch_size
                minibatch_indices = shuffled_indices[start:end]
                mb_obs = tf.gather(obs_tensor, minibatch_indices)
                mb_actions = tf.gather(actions_tensor, minibatch_indices)
                mb_old_log_probs = tf.gather(old_log_probs_tensor, minibatch_indices)
                mb_returns = tf.gather(returns_tensor, minibatch_indices)
                mb_advantages = tf.gather(advantages_tensor, minibatch_indices)
                mb_old_values = tf.gather(old_values_tensor, minibatch_indices)
                p_loss, v_loss, entropy = self.learn_step(
                    mb_obs, mb_actions, mb_old_log_probs,
                    mb_returns, mb_advantages, mb_old_values
                )
                policy_losses.append(p_loss.numpy())
                value_losses.append(v_loss.numpy())
                entropies.append(entropy.numpy())

        return np.mean(policy_losses), np.mean(value_losses), np.mean(entropies)

    def save_model(self, save_dir, timesteps):
        """Saves the full checkpoint using the manager.

        `save_dir` is accepted for interface compatibility; the manager was
        configured with its directory at construction time.
        """
        self.checkpoint_manager.save(checkpoint_number=timesteps)

    def load_model(self, save_dir, timesteps):
        """Loads the last successful checkpoint and restores normalizer state.

        Raises FileNotFoundError when no checkpoint exists in the manager's
        directory. `save_dir`/`timesteps` are accepted for interface
        compatibility; the manager's latest checkpoint is always used.
        """
        latest_checkpoint = self.checkpoint_manager.latest_checkpoint
        if latest_checkpoint:
            print(f"Restoring checkpoint from {latest_checkpoint}...")
            self.checkpoint.restore(latest_checkpoint).expect_partial()
            # Copy the restored variable values back into the NumPy-side
            # normalizer used on the eager/NumPy path.
            self.obs_rms.mean = self.rms_mean_var.numpy()
            self.obs_rms.var = self.rms_var_var.numpy()
            self.obs_rms.count = self.rms_count_var.numpy()
            print("Model, Optimizer, and Normalizer restored successfully.")
        else:
            raise FileNotFoundError(f"No checkpoint found in {self.checkpoint_manager.directory}")