import os
import sys
import re
import numpy as np
import torch
import matplotlib.pyplot as plt
import pandas as pd
import time
from datetime import datetime
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
from solar_sharer_battery_env import SolarSharer
from mappo.trainer.mappo import MAPPO
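# Project-local modules: SolarSharer is the multi-agent solar/battery P2P-sharing
# environment, and MAPPO is the multi-agent PPO trainer used to optimize it.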
def main():
STATE_TO_RUN = "pennsylvania" # "pennsylvania" or "colorado" or "oklahoma"
# --- Set the path to your training data ---
DATA_FILE_PATH = "/Users/ananygupta/Desktop/Final_revision/Australia_data/processed_data_ausgrid_100_houses.csv"
num_episodes = 10000 # total number of episodes to run
batch_size = 256 # e.g. 512, 1024, 2048
checkpoint_interval = 100000 # checkpoint cadence; larger than num_episodes here, so only the best model gets saved during this run
window_size = 32 # group episodes into blocks of 32 for the block-level reward summaries
env = SolarSharer(
data_path=DATA_FILE_PATH,
state=STATE_TO_RUN,
time_freq="30T"
)
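# "30T" is the pandas offset alias for 30-minute intervals; STATE_TO_RUN is
# assumed to select the tariff/pricing profile used inside SolarSharer.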
############################################################################################
# ─── Sanity check: env I/O shapes ─────────────────────────────────────
print("Observation space:", env.observation_space)
print("Action space :", env.action_space)
# Reset and inspect obs
obs = env.reset()
print(f"Reset returned {len(obs)} agent observations; each obs shape: {np.array(obs).shape}")
# Sample random actions and do one step
dummy_actions = np.random.rand(env.num_agents, env.action_space.shape[1]).astype(np.float32)
next_obs, rewards, done, info = env.step(dummy_actions)
print(f"Step outputs β†’ next_obs: {len(next_obs)}Γ—{np.array(next_obs).shape[1]}, "
f"rewards: {len(rewards)}, done: {done}")
print("Info keys:", list(info.keys()))
# ────────────────────────────────────────────────────────────────
# Count the number of houses in each group
env.group_counts = {
0: env.agent_groups.count(0),
1: env.agent_groups.count(1)
}
print(f"Number of houses in each group: {env.group_counts}")
max_steps = env.num_steps
# dims from the env
num_agents = env.num_agents
local_state_dim = env.observation_space.shape[1]
action_dim = env.action_space.shape[1]
# ─── Build a unique run directory ───────────────────────────
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
run_name = f"mappo_{STATE_TO_RUN}_{num_agents}agents_{num_episodes}eps_{timestamp}"
root_dir = os.path.join("Testing_with_australia_data", run_name)
os.makedirs(root_dir, exist_ok=True)
print(f"Saving training outputs to: {root_dir}")
logs_dir = os.path.join(root_dir, "logs")
plots_dir = os.path.join(root_dir, "plots")
os.makedirs(logs_dir, exist_ok=True)
os.makedirs(plots_dir, exist_ok=True)
# Create the MAPPO trainer
mappo = MAPPO(
n_agents=num_agents,
local_dim=local_state_dim,
global_dim=num_agents * local_state_dim,
act_dim=action_dim,
lr=2e-4,
gamma=0.95,
lam=0.95,
clip_eps=0.2,
k_epochs=4,
batch_size=batch_size
)
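# Standard CTDE-style setup for MAPPO: each actor acts on its own local
# observation (local_dim), while global_dim = num_agents * local_state_dim
# sizes the input of the (presumably centralized) critic.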
# ─────────────── Tracking / Logging Variables ───────────────
episode_rewards = [] # mean reward per episode (averaged across agents)
episode_total_rewards = [] # total reward per episode (sum across agents)
block_mean_rewards = [] # mean of mean-episode-rewards for each block of window_size
block_total_rewards = [] # sum of total-episode-rewards for each block of window_size
agent_rewards_log = [[] for _ in range(num_agents)]
best_mean_reward = -1e9
best_model_path = os.path.join(logs_dir, "best_model.pth")
daily_rewards = [] # mirrors episode_rewards; used for the block aggregation below
monthly_rewards = [] # unused; retained from an earlier logging scheme
training_start_time = time.time()
episode_durations = []
total_steps_global = 0
episode_log_data = []
performance_metrics_log = [] # detailed per-episode performance metrics reported by the environment
agent_charge_log = [[] for _ in range(num_agents)] # Track charge actions
agent_discharge_log = [[] for _ in range(num_agents)] # Track discharge actions
# ──────────── Training Loop ────────────
for episode in range(1, num_episodes + 1):
episode_start_time = time.time()
obs = np.array(env.reset(), dtype=np.float32)
# Collect the performance metrics of the *previous* episode
# =================================================================
# The env.reset() call above finalized the metrics for the episode that just finished.
# We retrieve them here. We check `if episode > 1` because there are no
# metrics to collect before the first episode has run.
if episode > 1:
# Retrieve the finalized metrics from the environment
last_episode_metrics = env.get_episode_metrics()
# Add the corresponding episode number for merging later
last_episode_metrics['Episode'] = episode - 1
# Append the dictionary of metrics to our new log
performance_metrics_log.append(last_episode_metrics)
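# get_episode_metrics() is assumed to return a flat dict of per-episode metrics,
# e.g. scalar totals such as 'battery_degradation_cost_total' plus per-step
# series such as 'degradation_cost_over_time', 'cost_savings_over_time' and
# 'grid_reduction_over_time' (the series are dropped before the CSV merge below).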
# =================================================================
total_reward = np.zeros(num_agents, dtype=np.float32)
done = False
step_count = 0
day_logs = []
episode_charges = [[] for _ in range(num_agents)]
episode_discharges = [[] for _ in range(num_agents)]
while not done:
# flatten the joint state once per step
# build global state and pick actions
# obs is already a NumPy array of shape (num_agents, local_dim)
global_obs = obs.flatten()
actions, logps = mappo.select_action(obs, global_obs)
# step environment
next_obs_list, rewards, done, info = env.step(actions)
# convert next observations to NumPy array too
next_obs = np.array(next_obs_list, dtype=np.float32)
next_global_obs = next_obs.flatten()
# store transition
# ensure fast conversion to torch.Tensor
local_obs_arr = np.array(obs, dtype=np.float32)
mappo.store(
local_obs_arr,
global_obs,
actions,
logps,
rewards,
done,
next_global_obs
)
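# The buffered transitions are assumed to be consumed on-policy by
# mappo.update() at the end of the episode (PPO-style clipped updates;
# lam presumably parameterizes GAE together with gamma).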
total_reward += rewards
obs = next_obs
step_count += 1
total_steps_global += 1
day_logs.append({
"step": step_count - 1,
"grid_import_no_p2p": info["grid_import_no_p2p"],
"grid_import_with_p2p": info["grid_import_with_p2p"],
"p2p_buy": info["p2p_buy"],
"p2p_sell": info["p2p_sell"],
"costs": info["costs"], # Capture costs for analysis
"charge_amount": info.get("charge_amount", np.zeros(num_agents)), # New
"discharge_amount": info.get("discharge_amount", np.zeros(num_agents)) # New
})
if step_count >= max_steps:
break
# ─── After each episode ───
# 1) Compute per-episode metrics
sum_ep_reward = float(np.sum(total_reward)) # total reward across all agents for this episode
mean_ep_reward = float(np.mean(total_reward)) # mean reward across agents for this episode
episode_total_rewards.append(sum_ep_reward)
episode_rewards.append(mean_ep_reward)
daily_rewards.append(mean_ep_reward)
# 2) If we just finished a block of window_size episodes, aggregate
if len(daily_rewards) % window_size == 0:
# Sum of total rewards over the last window_size episodes
last_totals = episode_total_rewards[-window_size:]
block_sum = sum(last_totals)
block_total_rewards.append(block_sum)
# Mean of mean-episode-rewards over the last window_size episodes
last_means = daily_rewards[-window_size:]
block_mean = sum(last_means) / window_size
block_mean_rewards.append(block_mean)
block_idx = len(block_mean_rewards)
print(
f"β†’ Completed Block {block_idx} "
f"| Episodes { (block_idx-1)*window_size + 1 }–{ block_idx*window_size } "
f"| Block Total Reward: {block_sum:.3f} "
f"| Block Mean Reward: {block_mean:.3f}"
)
# 3) Log agent-level rewards and the final step's charge/discharge action components
for i in range(num_agents):
agent_rewards_log[i].append(total_reward[i])
episode_charges[i].append(actions[i][4])
episode_discharges[i].append(actions[i][5])
# 4) Summarize per-step P2P trading and grid imports
steps_data = []
for entry in day_logs:
step_idx = entry["step"]
p2p_buy_array = entry["p2p_buy"]
p2p_sell_array = entry["p2p_sell"]
grid_no_p2p_array = entry["grid_import_no_p2p"]
grid_with_p2p_array = entry["grid_import_with_p2p"]
steps_data.append({
"step": step_idx,
"p2p_buy_sum": float(np.sum(p2p_buy_array)),
"p2p_sell_sum": float(np.sum(p2p_sell_array)),
"grid_import_no_p2p_sum": float(np.sum(grid_no_p2p_array)),
"grid_import_with_p2p_sum": float(np.sum(grid_with_p2p_array))
})
baseline_cost = np.sum([np.sum(entry["grid_import_no_p2p"]) * env.get_grid_price(entry["step"])
for entry in day_logs])
actual_cost = np.sum([np.sum(entry["costs"]) for entry in day_logs])
cost_reduction = (baseline_cost - actual_cost) / baseline_cost if baseline_cost > 0 else 0.0 # guard against a zero baseline
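# cost_reduction is the fraction of the no-P2P baseline bill saved this episode:
# (baseline_cost - actual_cost) / baseline_cost, e.g. baseline 100, actual 70 -> 0.30 (30%).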
# at end of episode
mappo.update() # Update the MAPPO agent
# save if best
if mean_ep_reward > best_mean_reward:
best_mean_reward = mean_ep_reward
mappo.save(best_model_path)
if episode % checkpoint_interval == 0:
ckpt_path = os.path.join(logs_dir, f"checkpoint_{episode}.pth")
mappo.save(ckpt_path)
# Per-episode timing and logging
episode_end_time = time.time()
episode_duration = episode_end_time - episode_start_time
# Per-episode progress printout
print(
f"Episode {episode}/{num_episodes} "
f"| Time per Episode: {episode_duration:.2f}s "
f"| Steps: {step_count} "
f"| Mean Reward: {mean_ep_reward:.3f} "
f"| Cost Reduction: {cost_reduction:.2%}"
)
# Record data in our per-episode log
episode_log_data.append({
"Episode": episode,
"Steps": step_count,
"Mean_Reward": mean_ep_reward,
"Total_Reward": sum_ep_reward,
"Cost_Reduction_Pct": cost_reduction * 100, # New
"Baseline_Cost": baseline_cost, # New
"Actual_Cost": actual_cost, # New
"Episode_Duration": episode_duration,
"Total_Charge": np.sum([np.sum(entry["charge_amount"]) for entry in day_logs]), # New
"Total_Discharge": np.sum([np.sum(entry["discharge_amount"]) for entry in day_logs]) # New
})
for i in range(num_agents):
agent_charge_log[i].append(np.mean(episode_charges[i]))
agent_discharge_log[i].append(np.mean(episode_discharges[i]))
# Capture the final episode's metrics
# =================================================================
# After the loop, the metrics for the final episode (num_episodes) are ready.
# We collect them here to ensure the log is complete.
final_episode_metrics = env.get_episode_metrics()
final_episode_metrics['Episode'] = num_episodes
performance_metrics_log.append(final_episode_metrics)
# =================================================================
# ─── End of all training ───
training_end_time = time.time()
total_training_time = training_end_time - training_start_time
# Save out per-episode agent rewards + mean rewards
np.save(os.path.join(logs_dir, "agent_rewards.npy"), np.array(agent_rewards_log))
np.save(os.path.join(logs_dir, "mean_rewards.npy"), np.array(episode_rewards))
np.save(os.path.join(logs_dir, "total_rewards.npy"), np.array(episode_total_rewards))
################################# PLOTTING & LOGGING ##################################################################
# ─────────── Create Final DataFrame for Logging and Plotting ───────────
# 1. Create a DataFrame from the original log data (rewards, costs, etc.)
df_rewards_log = pd.DataFrame(episode_log_data)
# 2. Create a DataFrame from the new performance metrics log
df_perf_log = pd.DataFrame(performance_metrics_log)
# 3. Merge the two DataFrames on the 'Episode' column.
# This combines all metrics into a single table.
df_final_log = pd.merge(df_rewards_log, df_perf_log.drop(columns=[
'degradation_cost_over_time',
'cost_savings_over_time',
'grid_reduction_over_time'
]), on="Episode")
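# The dropped columns are assumed to hold per-step series rather than per-episode
# scalars, so they are excluded from the flat per-episode table before merging.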
# ─────────── PLOTTING ───────────
# Ensure plot directory exists
os.makedirs(plots_dir, exist_ok=True)
# Helper: centered moving average
def moving_avg(series, window):
return pd.Series(series).rolling(window=window, center=True, min_periods=1).mean().to_numpy()
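# Example: moving_avg([1, 2, 3, 4, 5], 3) -> [1.5, 2.0, 3.0, 4.0, 4.5]
# (centered window; min_periods=1 keeps the output the same length as the input,
# so the endpoints average over fewer samples).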
# Smoothing window (in episodes)
ma_window = 300
episodes = np.arange(1, num_episodes + 1)
# 1. Mean Reward moving average
reward_ma = moving_avg(df_final_log["Mean_Reward"], ma_window)
plt.figure(figsize=(8,5))
plt.plot(episodes, reward_ma, linewidth=2, label=f"Mean Reward MA (win={ma_window})")
plt.xlabel("Episode")
plt.ylabel("Mean Reward")
plt.title("MAPPO: Mean Reward Moving Average")
plt.legend()
plt.grid(True)
plt.savefig(os.path.join(plots_dir, "mean_reward_ma.png"), dpi=200)
plt.close()
# 2. Total Reward moving average
total_ma = moving_avg(df_final_log["Total_Reward"], ma_window)
plt.figure(figsize=(8,5))
plt.plot(episodes, total_ma, linewidth=2, label=f"Total Reward MA (win={ma_window})")
plt.xlabel("Episode")
plt.ylabel("Total Reward")
plt.title("MAPPO: Total Reward Moving Average")
plt.legend()
plt.grid(True)
plt.savefig(os.path.join(plots_dir, "total_reward_ma.png"), dpi=200)
plt.close()
# 3. Cost Reduction (%) moving average
cost_ma = moving_avg(df_final_log["Cost_Reduction_Pct"], ma_window)
plt.figure(figsize=(8,5))
plt.plot(episodes, cost_ma, linewidth=2, label="Cost Reduction MA (%)")
plt.xlabel("Episode")
plt.ylabel("Cost Reduction (%)")
plt.title("MAPPO: Cost Reduction Moving Average")
plt.legend()
plt.grid(True)
plt.savefig(os.path.join(plots_dir, "cost_reduction_ma.png"), dpi=200)
plt.close()
# 4. Battery Degradation Cost moving average
degradation_ma = moving_avg(df_final_log["battery_degradation_cost_total"], ma_window)
plt.figure(figsize=(8,5))
plt.plot(episodes, degradation_ma, linewidth=2, label=f"Degradation Cost MA (win={ma_window})", color='purple')
plt.xlabel("Episode")
plt.ylabel("Total Degradation Cost ($)")
plt.title("MAPPO: Battery Degradation Cost Moving Average")
plt.legend()
plt.grid(True)
plt.savefig(os.path.join(plots_dir, "degradation_cost_ma.png"), dpi=200)
plt.close()
# Final confirmation message
print(f"\nAll moving-average plots saved to: {plots_dir}")
# ─── Save Final Logs to CSV ───
# 1. Add the total training time as a new row to the DataFrame
total_time_row = pd.DataFrame([{
"Episode": "Total_Training_Time",
"Episode_Duration": total_training_time
}])
df_to_save = pd.concat([df_final_log, total_time_row], ignore_index=True)
# 2. Define the path for the final CSV file.
log_csv_path = os.path.join(logs_dir, "training_performance_log.csv")
# 3. Select and reorder columns for the final CSV
columns_to_save = [
"Episode",
"Mean_Reward",
"Total_Reward",
"Cost_Reduction_Pct",
"Episode_Duration",
"battery_degradation_cost_total",
]
df_to_save = df_to_save[columns_to_save]
# 4. Save the comprehensive DataFrame to CSV.
df_to_save.to_csv(log_csv_path, index=False)
print(f"Saved comprehensive training performance log to: {log_csv_path}")
# ─── Final Timings Printout ───
print("\n" + "="*50)
print("TRAINING COMPLETE".center(50))
print(f"Total training time: {total_training_time:.2f} seconds")
print("="*50)
if __name__ == "__main__":
main()