"""
train.py — PPO training script for SmartGridEnv

Fixes vs. original:
    - check_env() validates the environment before training starts
    - VecNormalize auto-normalizes observations and rewards for stable gradients
    - 500,000 timesteps (was 10,000 — far too few for PPO to learn anything)
    - EvalCallback saves the best model checkpoint automatically
    - Hyperparameters tuned for this problem (n_steps, batch_size, ent_coef)
    - vec_normalize stats saved alongside model (required for correct inference)
    - TensorBoard logging enabled (optional — run: tensorboard --logdir ./tb_logs)
"""

import os

from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import (
    BaseCallback,
    CheckpointCallback,
    EvalCallback,
)
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

from smart_grid_env import SmartGridEnv

# ── 1. Validate environment ───────────────────────────────────────────────────
# Run the SB3 environment checker once on a throwaway instance so that API
# violations surface before any training time is spent.
print("Checking environment...")
# Keep a handle on the instance: passing an anonymous SmartGridEnv() straight
# into check_env() would leave it open for the rest of the script.
_checked_env = SmartGridEnv()
try:
    check_env(_checked_env, warn=True)
finally:
    # Release whatever the environment holds, even if the check raised.
    _checked_env.close()
print("Environment check passed.\n")

# ── 2. Vectorised training environment (N_ENVS environment copies) ────────────
# NOTE: make_vec_env is called without `vec_env_cls`, so it uses its default
# vectorised wrapper. In SB3 that default is DummyVecEnv, which steps the copies
# one after another in this process; pass vec_env_cls=SubprocVecEnv if real
# multi-process parallelism is wanted.
N_ENVS = 4
train_env = VecNormalize(
    make_vec_env(SmartGridEnv, n_envs=N_ENVS),
    norm_obs=True,     # rescale observations with running mean/std statistics
    norm_reward=True,  # rescale rewards as well, for better-conditioned PPO updates
    clip_obs=10.0,     # clip normalised observations to [-10, 10]
)

# ── 3. Separate evaluation environment (no reward normalisation) ───────────────
# A single-copy environment used only by the evaluation callback. Its running
# statistics are frozen (training=False); SB3's EvalCallback synchronises them
# from the training environment before each evaluation round.
eval_env = VecNormalize(
    make_vec_env(SmartGridEnv, n_envs=1),
    training=False,     # never update the normalisation statistics here
    norm_obs=True,      # observations must be scaled the same way as in training
    norm_reward=False,  # report raw rewards so eval metrics stay interpretable
    clip_obs=10.0,
)

# ── 4. Define the PPO model ────────────────────────────────────────────────────
# MLP-policy PPO agent bound to the normalised training environment.
model = PPO(
    policy="MlpPolicy",
    env=train_env,
    verbose=1,
    tensorboard_log="./tb_logs",
    # --- Core PPO hyperparameters ---
    n_steps=1024,        # rollout length per environment copy
    batch_size=256,      # minibatch size for each gradient step
    n_epochs=10,         # optimisation passes over every rollout buffer
    gamma=0.99,          # discount factor
    gae_lambda=0.95,     # GAE bias/variance trade-off
    clip_range=0.2,      # PPO surrogate-objective clipping
    learning_rate=3e-4,  # optimiser step size
    ent_coef=0.01,       # entropy bonus weight (encourages exploration)
    vf_coef=0.5,         # value-loss weight
    max_grad_norm=0.5,   # gradient-norm clipping threshold
    # --- Policy network architecture ---
    policy_kwargs={"net_arch": [128, 128]},  # two hidden layers of 128 units
)

# ── 5. Callbacks ───────────────────────────────────────────────────────────────
class SaveVecNormalizeOnBest(BaseCallback):
    """Save the training VecNormalize statistics when a new best model is found.

    Intended to be passed as ``callback_on_new_best`` to ``EvalCallback``.
    ``best_model.zip`` on its own cannot be used for inference: the policy
    expects observations normalised with the statistics that were current when
    it was saved, and those differ from the statistics at the end of training.

    Args:
        save_path: File the statistics are written to (overwritten on every
            new best model).
        verbose: Verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, save_path: str, verbose: int = 0):
        super().__init__(verbose)
        self.save_path = save_path

    def _on_step(self) -> bool:
        # The model knows which VecNormalize wrapper (if any) it trains on.
        vec_normalize = self.model.get_vec_normalize_env()
        if vec_normalize is not None:
            vec_normalize.save(self.save_path)
        return True  # never request early stopping


os.makedirs("./best_model", exist_ok=True)
os.makedirs("./checkpoints", exist_ok=True)

# Periodic evaluation; keeps the best-scoring model together with its
# matching normalisation statistics in ./best_model.
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path="./best_model",
    log_path="./eval_logs",
    # eval_freq counts callback calls, and each call advances N_ENVS env
    # steps, so divide to evaluate roughly every 5k environment steps.
    eval_freq=max(5_000 // N_ENVS, 1),
    n_eval_episodes=20,  # average the score over 20 episodes
    deterministic=True,
    render=False,
    callback_on_new_best=SaveVecNormalizeOnBest(
        os.path.join("./best_model", "vec_normalize.pkl")
    ),
)

# Periodic snapshots (roughly every 50k environment steps). The VecNormalize
# statistics are stored next to each snapshot so it can be reloaded correctly.
checkpoint_callback = CheckpointCallback(
    save_freq=max(50_000 // N_ENVS, 1),
    save_path="./checkpoints",
    name_prefix="ppo_smart_grid",
    save_vecnormalize=True,
)

# ── 6. Train ───────────────────────────────────────────────────────────────────
# Total number of environment steps to train for, summed over all env copies.
TOTAL_TIMESTEPS = 500_000

print(f"Training PPO for {TOTAL_TIMESTEPS:,} timesteps across {N_ENVS} parallel envs...")
print("Tip: run `tensorboard --logdir ./tb_logs` to monitor training live.\n")

# Evaluation and checkpointing both run inside the learning loop.
training_callbacks = [eval_callback, checkpoint_callback]
model.learn(
    total_timesteps=TOTAL_TIMESTEPS,
    callback=training_callbacks,
    progress_bar=True,
)

# ── 7. Save final model + normalisation statistics ────────────────────────────
# The policy and the VecNormalize statistics form a pair: the saved model only
# behaves correctly when observations are normalised with these exact stats.
model.save("ppo_smart_grid")
train_env.save("vec_normalize.pkl")   # MUST be saved — needed for inference

# Training and evaluation are finished and everything is on disk, so release
# both environments instead of leaving them open until the interpreter exits.
# model.predict() does not need the training environment.
train_env.close()
eval_env.close()

print("\nTraining complete!")
print("  Saved: ppo_smart_grid.zip")
print("  Saved: vec_normalize.pkl  (required alongside the model for inference)")
print("  Best checkpoint: ./best_model/best_model.zip")

# ── 8. Quick sanity-check: run one episode with the trained agent ──────────────
# Roll the final policy through a fresh environment, using the normalisation
# statistics that were just written to disk, and print a per-step trace.
print("\n--- Sanity check: one 24-hour episode ---")

# Index -> label for the discrete action printed in the trace.
ACTION_LABELS = ("Hold", "Charge", "Discharge")

test_env = VecNormalize.load("vec_normalize.pkl", DummyVecEnv([SmartGridEnv]))
test_env.training = False     # do not update the loaded statistics
test_env.norm_reward = False  # keep rewards in their raw scale

total_cost = 0.0
try:
    obs = test_env.reset()
    for hour in range(1, 25):
        action, _ = model.predict(obs, deterministic=True)
        obs, _, done, infos = test_env.step(action)
        step_info = infos[0]  # single-environment VecEnv: one info dict
        total_cost += step_info["cost"]
        action_label = ACTION_LABELS[int(action[0])]
        print(
            f"  Hour {hour:02d} | Action: {action_label:<10} | "
            f"Battery: {step_info['battery_soc']:5.1f}% | "
            f"Price: ${step_info['price']:.3f} | "
            f"Step cost: ${step_info['cost']:.3f}"
        )
        if done[0]:
            # A VecEnv resets itself when an episode ends; stop here so costs
            # from a second episode are never mixed into this total.
            break
finally:
    # Always release the environment, even if a step or info lookup failed.
    test_env.close()

print(f"\nTotal 24-hour cost: ${total_cost:.2f}")