import os
import sys
import re
import numpy as np
import torch
import matplotlib.pyplot as plt
import pandas as pd
import time
from datetime import datetime
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
from solar_sharer_battery_env import SolarSharer
from mappo.trainer.mappo import MAPPO
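# Project-local modules: SolarSharer is the multi-agent solar/battery P2P-sharing
# environment, and MAPPO is the multi-agent PPO trainer used to optimize it.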
def main():
STATE_TO_RUN = "pennsylvania" # "pennsylvania" or "colorado" or "oklahoma"
# --- Set the path to your training data ---
DATA_FILE_PATH = "/Users/ananygupta/Desktop/Final_revision/Australia_data/processed_data_ausgrid_100_houses.csv"
num_episodes = 10000 # total number of episodes to run
batch_size = 256 # e.g. 512, 1024, 2048
checkpoint_interval = 100000 # checkpoint cadence; larger than num_episodes here, so only the best model gets saved during this run
window_size = 32 # group episodes into blocks of 32 for the block-level reward summaries
env = SolarSharer(
data_path=DATA_FILE_PATH,
state=STATE_TO_RUN,
time_freq="30T"
)
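# "30T" is the pandas offset alias for 30-minute intervals; STATE_TO_RUN is
# assumed to select the tariff/pricing profile used inside SolarSharer.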
############################################################################################
# ─── Sanity check: env I/O shapes ─────────────────────────────────────
print("Observation space:", env.observation_space)
print("Action space :", env.action_space)
# Reset and inspect obs
obs = env.reset()
print(f"Reset returned {len(obs)} agent observations; each obs shape: {np.array(obs).shape}")
# Sample random actions and do one step
dummy_actions = np.random.rand(env.num_agents, env.action_space.shape[1]).astype(np.float32)
next_obs, rewards, done, info = env.step(dummy_actions)
print(f"Step outputs β†’ next_obs: {len(next_obs)}Γ—{np.array(next_obs).shape[1]}, "
f"rewards: {len(rewards)}, done: {done}")
print("Info keys:", list(info.keys()))
# ────────────────────────────────────────────────────────────────
# Count the number of houses in each group
env.group_counts = {
0: env.agent_groups.count(0),
1: env.agent_groups.count(1)
}
print(f"Number of houses in each group: {env.group_counts}")
max_steps = env.num_steps
# dims from the env
num_agents = env.num_agents
local_state_dim = env.observation_space.shape[1]
action_dim = env.action_space.shape[1]
# ─── Build a unique run directory ───────────────────────────
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
run_name = f"mappo_{STATE_TO_RUN}_{num_agents}agents_{num_episodes}eps_{timestamp}"
root_dir = os.path.join("Testing_with_australia_data", run_name)
os.makedirs(root_dir, exist_ok=True)
print(f"Saving training outputs to: {root_dir}")
logs_dir = os.path.join(root_dir, "logs")
plots_dir = os.path.join(root_dir, "plots")
os.makedirs(logs_dir, exist_ok=True)
os.makedirs(plots_dir, exist_ok=True)
# Create the MAPPO trainer
mappo = MAPPO(
n_agents=num_agents,
local_dim=local_state_dim,
global_dim=num_agents * local_state_dim,
act_dim=action_dim,
lr=2e-4,
gamma=0.95,
lam=0.95,
clip_eps=0.2,
k_epochs=4,
batch_size=batch_size
)
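# Standard CTDE-style setup for MAPPO: each actor acts on its own local
# observation (local_dim), while global_dim = num_agents * local_state_dim
# sizes the input of the (presumably centralized) critic.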
# ─────────────── Tracking / Logging Variables ───────────────
episode_rewards = [] # mean reward per episode (averaged across agents)
episode_total_rewards = [] # total reward per episode (sum across agents)
block_mean_rewards = [] # mean of mean-episode-rewards for each block of window_size
block_total_rewards = [] # sum of total-episode-rewards for each block of window_size
agent_rewards_log = [[] for _ in range(num_agents)]
best_mean_reward = -1e9
best_model_path = os.path.join(logs_dir, "best_model.pth")
daily_rewards = [] # mirrors episode_rewards; used for the block aggregation below
monthly_rewards = [] # unused; retained from an earlier logging scheme
training_start_time = time.time()
episode_durations = []
total_steps_global = 0
episode_log_data = []
performance_metrics_log = [] # detailed per-episode performance metrics reported by the environment
agent_charge_log = [[] for _ in range(num_agents)] # Track charge actions
agent_discharge_log = [[] for _ in range(num_agents)] # Track discharge actions
# ──────────── Training Loop ────────────
for episode in range(1, num_episodes + 1):
episode_start_time = time.time()
obs = np.array(env.reset(), dtype=np.float32)
# Collect the performance metrics of the *previous* episode
# =================================================================
# The env.reset() call above finalized the metrics for the episode that just finished.
# We retrieve them here. We check `if episode > 1` because there are no
# metrics to collect before the first episode has run.
if episode > 1:
# Retrieve the finalized metrics from the environment
last_episode_metrics = env.get_episode_metrics()
# Add the corresponding episode number for merging later
last_episode_metrics['Episode'] = episode - 1
# Append the dictionary of metrics to our new log
performance_metrics_log.append(last_episode_metrics)
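# get_episode_metrics() is assumed to return a flat dict of per-episode metrics,
# e.g. scalar totals such as 'battery_degradation_cost_total' plus per-step
# series such as 'degradation_cost_over_time', 'cost_savings_over_time' and
# 'grid_reduction_over_time' (the series are dropped before the CSV merge below).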
# =================================================================
total_reward = np.zeros(num_agents, dtype=np.float32)
done = False
step_count = 0
day_logs = []
episode_charges = [[] for _ in range(num_agents)]
episode_discharges = [[] for _ in range(num_agents)]
while not done:
# flatten the joint state once per step
# build global state and pick actions
# obs is already a NumPy array of shape (num_agents, local_dim)
global_obs = obs.flatten()
actions, logps = mappo.select_action(obs, global_obs)
# step environment
next_obs_list, rewards, done, info = env.step(actions)
# convert next observations to NumPy array too
next_obs = np.array(next_obs_list, dtype=np.float32)
next_global_obs = next_obs.flatten()
# store transition
# ensure fast conversion to torch.Tensor
local_obs_arr = np.array(obs, dtype=np.float32)
mappo.store(
local_obs_arr,
global_obs,
actions,
logps,
rewards,
done,
next_global_obs
)
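# The buffered transitions are assumed to be consumed on-policy by
# mappo.update() at the end of the episode (PPO-style clipped updates;
# lam presumably parameterizes GAE together with gamma).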
total_reward += rewards
obs = next_obs
step_count += 1
total_steps_global += 1
day_logs.append({
"step": step_count - 1,
"grid_import_no_p2p": info["grid_import_no_p2p"],
"grid_import_with_p2p": info["grid_import_with_p2p"],
"p2p_buy": info["p2p_buy"],
"p2p_sell": info["p2p_sell"],
"costs": info["costs"], # Capture costs for analysis
"charge_amount": info.get("charge_amount", np.zeros(num_agents)), # New
"discharge_amount": info.get("discharge_amount", np.zeros(num_agents)) # New
})
if step_count >= max_steps:
break
# ─── After each episode ───
# 1) Compute per-episode metrics
sum_ep_reward = float(np.sum(total_reward)) # total reward across all agents for this episode
mean_ep_reward = float(np.mean(total_reward)) # mean reward across agents for this episode
episode_total_rewards.append(sum_ep_reward)
episode_rewards.append(mean_ep_reward)
daily_rewards.append(mean_ep_reward)
# 2) If we just finished a block of window_size episodes, aggregate
if len(daily_rewards) % window_size == 0:
# Sum of total rewards over the last window_size episodes
last_totals = episode_total_rewards[-window_size:]
block_sum = sum(last_totals)
block_total_rewards.append(block_sum)
# Mean of mean-episode-rewards over the last window_size episodes
last_means = daily_rewards[-window_size:]
block_mean = sum(last_means) / window_size
block_mean_rewards.append(block_mean)
block_idx = len(block_mean_rewards)
print(
f"β†’ Completed Block {block_idx} "
f"| Episodes { (block_idx-1)*window_size + 1 }–{ block_idx*window_size } "
f"| Block Total Reward: {block_sum:.3f} "
f"| Block Mean Reward: {block_mean:.3f}"
)
# 3) Log agent-level rewards and the final step's charge/discharge action components
for i in range(num_agents):
agent_rewards_log[i].append(total_reward[i])
episode_charges[i].append(actions[i][4])
episode_discharges[i].append(actions[i][5])
# 4) Summarize per-step P2P trading and grid imports
steps_data = []
for entry in day_logs:
step_idx = entry["step"]
p2p_buy_array = entry["p2p_buy"]
p2p_sell_array = entry["p2p_sell"]
grid_no_p2p_array = entry["grid_import_no_p2p"]
grid_with_p2p_array = entry["grid_import_with_p2p"]
steps_data.append({
"step": step_idx,
"p2p_buy_sum": float(np.sum(p2p_buy_array)),
"p2p_sell_sum": float(np.sum(p2p_sell_array)),
"grid_import_no_p2p_sum": float(np.sum(grid_no_p2p_array)),
"grid_import_with_p2p_sum": float(np.sum(grid_with_p2p_array))
})
baseline_cost = np.sum([np.sum(entry["grid_import_no_p2p"]) * env.get_grid_price(entry["step"])
for entry in day_logs])
actual_cost = np.sum([np.sum(entry["costs"]) for entry in day_logs])
cost_reduction = (baseline_cost - actual_cost) / baseline_cost if baseline_cost > 0 else 0.0 # guard against a zero baseline
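# cost_reduction is the fraction of the no-P2P baseline bill saved this episode:
# (baseline_cost - actual_cost) / baseline_cost, e.g. baseline 100, actual 70 -> 0.30 (30%).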
# at end of episode
mappo.update() # Update the MAPPO agent
# save if best
if mean_ep_reward > best_mean_reward:
best_mean_reward = mean_ep_reward
mappo.save(best_model_path)
if episode % checkpoint_interval == 0:
ckpt_path = os.path.join(logs_dir, f"checkpoint_{episode}.pth")
mappo.save(ckpt_path)
# Per-episode timing and logging
episode_end_time = time.time()
episode_duration = episode_end_time - episode_start_time
# Per-episode progress printout
print(
f"Episode {episode}/{num_episodes} "
f"| Time per Episode: {episode_duration:.2f}s "
f"| Steps: {step_count} "
f"| Mean Reward: {mean_ep_reward:.3f} "
f"| Cost Reduction: {cost_reduction:.2%}"
)
# Record data in our per-episode log
episode_log_data.append({
"Episode": episode,
"Steps": step_count,
"Mean_Reward": mean_ep_reward,
"Total_Reward": sum_ep_reward,
"Cost_Reduction_Pct": cost_reduction * 100, # New
"Baseline_Cost": baseline_cost, # New
"Actual_Cost": actual_cost, # New
"Episode_Duration": episode_duration,
"Total_Charge": np.sum([np.sum(entry["charge_amount"]) for entry in day_logs]), # New
"Total_Discharge": np.sum([np.sum(entry["discharge_amount"]) for entry in day_logs]) # New
})
for i in range(num_agents):
agent_charge_log[i].append(np.mean(episode_charges[i]))
agent_discharge_log[i].append(np.mean(episode_discharges[i]))
# Capture the final episode's metrics
# =================================================================
# After the loop, the metrics for the final episode (num_episodes) are ready.
# We collect them here to ensure the log is complete.
final_episode_metrics = env.get_episode_metrics()
final_episode_metrics['Episode'] = num_episodes
performance_metrics_log.append(final_episode_metrics)
# =================================================================
# ─── End of all training ───
training_end_time = time.time()
total_training_time = training_end_time - training_start_time
# Save out per-episode agent rewards + mean rewards
np.save(os.path.join(logs_dir, "agent_rewards.npy"), np.array(agent_rewards_log))
np.save(os.path.join(logs_dir, "mean_rewards.npy"), np.array(episode_rewards))
np.save(os.path.join(logs_dir, "total_rewards.npy"), np.array(episode_total_rewards))
################################# PLOTTING & LOGGING ##################################################################
# ─────────── Create Final DataFrame for Logging and Plotting ───────────
# 1. Create a DataFrame from the original log data (rewards, costs, etc.)
df_rewards_log = pd.DataFrame(episode_log_data)
# 2. Create a DataFrame from the new performance metrics log
df_perf_log = pd.DataFrame(performance_metrics_log)
# 3. Merge the two DataFrames on the 'Episode' column.
# This combines all metrics into a single table.
df_final_log = pd.merge(df_rewards_log, df_perf_log.drop(columns=[
'degradation_cost_over_time',
'cost_savings_over_time',
'grid_reduction_over_time'
]), on="Episode")
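# The dropped columns are assumed to hold per-step series rather than per-episode
# scalars, so they are excluded from the flat per-episode table before merging.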
# ─────────── PLOTTING ───────────
# Ensure plot directory exists
os.makedirs(plots_dir, exist_ok=True)
# Helper: centered moving average
def moving_avg(series, window):
return pd.Series(series).rolling(window=window, center=True, min_periods=1).mean().to_numpy()
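# Example: moving_avg([1, 2, 3, 4, 5], 3) -> [1.5, 2.0, 3.0, 4.0, 4.5]
# (centered window; min_periods=1 keeps the output the same length as the input,
# so the endpoints average over fewer samples).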
# Smoothing window (in episodes)
ma_window = 300
episodes = np.arange(1, num_episodes + 1)
# 1. Mean Reward moving average
reward_ma = moving_avg(df_final_log["Mean_Reward"], ma_window)
plt.figure(figsize=(8,5))
plt.plot(episodes, reward_ma, linewidth=2, label=f"Mean Reward MA (win={ma_window})")
plt.xlabel("Episode")
plt.ylabel("Mean Reward")
plt.title("MAPPO: Mean Reward Moving Average")
plt.legend()
plt.grid(True)
plt.savefig(os.path.join(plots_dir, "mean_reward_ma.png"), dpi=200)
plt.close()
# 2. Total Reward moving average
total_ma = moving_avg(df_final_log["Total_Reward"], ma_window)
plt.figure(figsize=(8,5))
plt.plot(episodes, total_ma, linewidth=2, label=f"Total Reward MA (win={ma_window})")
plt.xlabel("Episode")
plt.ylabel("Total Reward")
plt.title("MAPPO: Total Reward Moving Average")
plt.legend()
plt.grid(True)
plt.savefig(os.path.join(plots_dir, "total_reward_ma.png"), dpi=200)
plt.close()
# 3. Cost Reduction (%) moving average
cost_ma = moving_avg(df_final_log["Cost_Reduction_Pct"], ma_window)
plt.figure(figsize=(8,5))
plt.plot(episodes, cost_ma, linewidth=2, label="Cost Reduction MA (%)")
plt.xlabel("Episode")
plt.ylabel("Cost Reduction (%)")
plt.title("MAPPO: Cost Reduction Moving Average")
plt.legend()
plt.grid(True)
plt.savefig(os.path.join(plots_dir, "cost_reduction_ma.png"), dpi=200)
plt.close()
# 4. Battery Degradation Cost moving average
degradation_ma = moving_avg(df_final_log["battery_degradation_cost_total"], ma_window)
plt.figure(figsize=(8,5))
plt.plot(episodes, degradation_ma, linewidth=2, label=f"Degradation Cost MA (win={ma_window})", color='purple')
plt.xlabel("Episode")
plt.ylabel("Total Degradation Cost ($)")
plt.title("MAPPO: Battery Degradation Cost Moving Average")
plt.legend()
plt.grid(True)
plt.savefig(os.path.join(plots_dir, "degradation_cost_ma.png"), dpi=200)
plt.close()
# Final confirmation message
print(f"\nAll moving-average plots saved to: {plots_dir}")
# ─── Save Final Logs to CSV ───
# 1. Add the total training time as a new row to the DataFrame
total_time_row = pd.DataFrame([{
"Episode": "Total_Training_Time",
"Episode_Duration": total_training_time
}])
df_to_save = pd.concat([df_final_log, total_time_row], ignore_index=True)
# 2. Define the path for the final CSV file.
log_csv_path = os.path.join(logs_dir, "training_performance_log.csv")
# 3. Select and reorder columns for the final CSV
columns_to_save = [
"Episode",
"Mean_Reward",
"Total_Reward",
"Cost_Reduction_Pct",
"Episode_Duration",
"battery_degradation_cost_total",
]
df_to_save = df_to_save[columns_to_save]
# 4. Save the comprehensive DataFrame to CSV.
df_to_save.to_csv(log_csv_path, index=False)
print(f"Saved comprehensive training performance log to: {log_csv_path}")
# ─── Final Timings Printout ───
print("\n" + "="*50)
print("TRAINING COMPLETE".center(50))
print(f"Total training time: {total_training_time:.2f} seconds")
print("="*50)
if __name__ == "__main__":
main()