Spaces:

chinmay0805
/

Smart-Grid-System

Running

App Files Files Community

Smart-Grid-System / train.py

chinmay0805

initial commit

06f9287 about 1 month ago

raw

history blame contribute delete

5.88 kB

	"""
	train.py — PPO training script for SmartGridEnv

	Fixes vs. original:
	- check_env() validates the environment before training starts
	- VecNormalize auto-normalizes observations and rewards for stable gradients
	- 500,000 timesteps (was 10,000 — far too few for PPO to learn anything)
	- EvalCallback saves the best model checkpoint automatically
	- Hyperparameters tuned for this problem (n_steps, batch_size, ent_coef)
	- vec_normalize stats saved alongside model (required for correct inference)
	- TensorBoard logging enabled (optional — run: tensorboard --logdir ./tb_logs)
	"""

	import os
	from stable_baselines3 import PPO
	from stable_baselines3.common.env_util import make_vec_env
	from stable_baselines3.common.vec_env import VecNormalize
	from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback
	from stable_baselines3.common.env_checker import check_env

	from smart_grid_env import SmartGridEnv

	# ── 1. Validate environment ───────────────────────────────────────────────────
	# Run the SB3 environment checker on a throwaway instance before any training
	# starts, so interface problems surface immediately rather than mid-run.
	print("Checking environment...")
	_probe_env = SmartGridEnv()
	try:
	    check_env(_probe_env, warn=True)
	finally:
	    # The probe instance is not reused for training; close it explicitly so
	    # any resources it holds are released even when the check raises.
	    _probe_env.close()
	print("Environment check passed.\n")

	# ── 2. Vectorised training environment (4 environment copies) ─────────────────
	N_ENVS = 4

	# Build the vectorised env and wrap it in a single expression.
	# NOTE(review): make_vec_env is called without vec_env_cls, so the copies are
	# presumably stepped by SB3's default vectorised wrapper — confirm whether
	# true subprocess parallelism is wanted here.
	train_env = VecNormalize(
	    make_vec_env(SmartGridEnv, n_envs=N_ENVS),
	    norm_obs=True,     # running normalisation of every observation dimension
	    norm_reward=True,  # running normalisation of the reward scale
	    clip_obs=10.0,     # clip normalised observations to [-10, 10]
	)

	# ── 3. Separate evaluation environment (no reward normalisation) ───────────────
	# A single-copy env used only for evaluation. Observation normalisation is
	# kept on so the policy sees inputs on the same scale as during training,
	# while rewards are reported raw.
	eval_env = VecNormalize(
	    make_vec_env(SmartGridEnv, n_envs=1),
	    norm_obs=True,
	    norm_reward=False,  # keep evaluation returns in the env's own units
	    training=False,     # freeze running statistics; they must come from train_env
	                        # (presumably synced by EvalCallback — verify against SB3 docs)
	    clip_obs=10.0,
	)

	# ── 4. Define the PPO model ────────────────────────────────────────────────────
	# Core PPO hyperparameters, gathered in one mapping so they can be read (and
	# logged) as a unit before being unpacked into the constructor.
	_PPO_HYPERPARAMS = {
	    "n_steps": 1024,         # steps collected per env per rollout
	    "batch_size": 256,       # minibatch size for each gradient update
	    "n_epochs": 10,          # passes over each rollout buffer
	    "gamma": 0.99,           # discount factor
	    "gae_lambda": 0.95,      # GAE smoothing
	    "clip_range": 0.2,       # PPO clip parameter
	    "learning_rate": 3e-4,   # Adam learning rate
	    "ent_coef": 0.01,        # entropy bonus
	    "vf_coef": 0.5,          # value-loss weight
	    "max_grad_norm": 0.5,    # gradient clipping threshold
	}

	# MLP policy with two hidden layers of 128 units each.
	_POLICY_KWARGS = {"net_arch": [128, 128]}

	model = PPO(
	    "MlpPolicy",
	    train_env,
	    verbose=1,
	    tensorboard_log="./tb_logs",
	    policy_kwargs=_POLICY_KWARGS,
	    **_PPO_HYPERPARAMS,
	)

	# ── 5. Callbacks ───────────────────────────────────────────────────────────────
	from stable_baselines3.common.callbacks import BaseCallback

	os.makedirs("./best_model", exist_ok=True)
	os.makedirs("./checkpoints", exist_ok=True)


	class _SaveVecNormalizeOnBest(BaseCallback):
	    """Save the training VecNormalize statistics whenever a new best model is found.

	    The policy is trained on normalised observations, so a saved model is only
	    usable together with the normalisation statistics from the same moment in
	    training. Without this callback, ./best_model/best_model.zip would have to
	    be paired with the *final* statistics, which no longer match it.

	    Args:
	        save_path: File path the statistics are written to.
	        verbose: Verbosity level forwarded to BaseCallback.
	    """

	    def __init__(self, save_path, verbose=0):
	        super().__init__(verbose)
	        self.save_path = save_path

	    def _on_step(self) -> bool:
	        # Resolve the VecNormalize wrapper from the model being trained rather
	        # than from a global, so the callback works for any wrapped env.
	        vec_normalize = self.model.get_vec_normalize_env()
	        if vec_normalize is not None:
	            vec_normalize.save(self.save_path)
	        return True  # never request early stopping


	eval_callback = EvalCallback(
	    eval_env,
	    # Triggered right after best_model.zip is written.
	    callback_on_new_best=_SaveVecNormalizeOnBest(
	        os.path.join("./best_model", "vec_normalize.pkl")
	    ),
	    best_model_save_path="./best_model",
	    log_path="./eval_logs",
	    # eval_freq counts callback calls, and each call advances all N_ENVS envs,
	    # so divide to evaluate roughly every 5k environment steps.
	    eval_freq=max(5_000 // N_ENVS, 1),
	    n_eval_episodes=20,  # average over 20 evaluation episodes
	    deterministic=True,
	    render=False,
	)

	checkpoint_callback = CheckpointCallback(
	    save_freq=max(50_000 // N_ENVS, 1),  # roughly every 50k environment steps
	    save_path="./checkpoints",
	    name_prefix="ppo_smart_grid",
	    # Store the matching normalisation statistics next to every checkpoint so
	    # each one can be loaded for inference on its own.
	    save_vecnormalize=True,
	)

	# ── 6. Train ───────────────────────────────────────────────────────────────────
	TOTAL_TIMESTEPS = 500_000
	print(f"Training PPO for {TOTAL_TIMESTEPS:,} timesteps across {N_ENVS} parallel envs...")
	print("Tip: run `tensorboard --logdir ./tb_logs` to monitor training live.\n")

	# Evaluation (best-model tracking) and periodic checkpointing both run
	# alongside training.
	_training_callbacks = [eval_callback, checkpoint_callback]
	model.learn(total_timesteps=TOTAL_TIMESTEPS, callback=_training_callbacks, progress_bar=True)

	# ── 7. Save final model + normalisation statistics ────────────────────────────
	# The policy and the VecNormalize statistics are written as a pair: the
	# statistics file is loaded again for the sanity-check episode that follows.
	model.save("ppo_smart_grid")
	train_env.save("vec_normalize.pkl")

	for _summary_line in (
	    "\nTraining complete!",
	    " Saved: ppo_smart_grid.zip",
	    " Saved: vec_normalize.pkl (required alongside the model for inference)",
	    " Best checkpoint: ./best_model/best_model.zip",
	):
	    print(_summary_line)

	# ── 8. Quick sanity-check: run one episode with the trained agent ──────────────
	print("\n--- Sanity check: one 24-hour episode ---")
	from stable_baselines3.common.vec_env import DummyVecEnv

	# Rebuild a single-env pipeline and restore the saved normalisation statistics
	# so the policy sees observations on the same scale as during training.
	test_env = DummyVecEnv([SmartGridEnv])
	test_env = VecNormalize.load("vec_normalize.pkl", test_env)
	test_env.training = False     # do not update running statistics during the check
	test_env.norm_reward = False  # report raw rewards

	total_cost = 0.0
	try:
	    obs = test_env.reset()
	    for hour in range(24):
	        action, _ = model.predict(obs, deterministic=True)
	        obs, reward, done, info = test_env.step(action)
	        # Vectorised envs return one info dict per env; there is only one here.
	        step_info = info[0]
	        total_cost += step_info["cost"]
	        action_label = ["Hold", "Charge", "Discharge"][int(action[0])]
	        print(
	            f" Hour {hour+1:02d} | Action: {action_label:<10} | "
	            f"Battery: {step_info['battery_soc']:5.1f}% | "
	            f"Price: ${step_info['price']:.3f} | "
	            f"Step cost: ${step_info['cost']:.3f}"
	        )
	        if done[0]:
	            # Vectorised envs reset automatically once an episode ends; stop
	            # here so steps from a fresh episode are never mixed into the total.
	            break
	finally:
	    # Always release the environment, even if a step or lookup fails.
	    test_env.close()

	print(f"\nTotal 24-hour cost: ${total_cost:.2f}")