import os import sys import re # ← add thist import numpy as np import torch import matplotlib.pyplot as plt import pandas as pd import time from datetime import datetime sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) from solar_sharer_battery_env import SolarSharer from mappo.trainer.mappo import MAPPO def main(): STATE_TO_RUN = "pennsylvania" # "pennsylvania" or "colorado" or "oklahoma" # --- Set the path to your training data --- DATA_FILE_PATH = "/Users/ananygupta/Desktop/Final_revision/Australia_data/processed_data_ausgrid_100_houses.csv" num_episodes = 10000 # total # of episodes you want to run batch_size = 256 # e.g. 512, 1024, 2048 checkpoint_interval = 100000 window_size = 32 # ← group episodes in blocks of 30 env = SolarSharer( data_path=DATA_FILE_PATH, state=STATE_TO_RUN, time_freq="30T" ) ############################################################################################ # ─── Sanity check: env I/O shapes ───────────────────────────────────── print("Observation space:", env.observation_space) print("Action space :", env.action_space) # Reset and inspect obs obs = env.reset() print(f"Reset returned {len(obs)} agent observations; each obs shape: {np.array(obs).shape}") # Sample random actions and do one step dummy_actions = np.random.rand(env.num_agents, env.action_space.shape[1]).astype(np.float32) next_obs, rewards, done, info = env.step(dummy_actions) print(f"Step outputs → next_obs: {len(next_obs)}×{np.array(next_obs).shape[1]}, " f"rewards: {len(rewards)}, done: {done}") print("Info keys:", list(info.keys())) # ──────────────────────────────────────────────────────────────── # Count the number of houses in each group env.group_counts = { 0: env.agent_groups.count(0), 1: env.agent_groups.count(1) } print(f"Number of houses in each group: {env.group_counts}") max_steps = env.num_steps # dims from the env num_agents = env.num_agents local_state_dim = env.observation_space.shape[1] action_dim = env.action_space.shape[1] # ─── Build a unique run directory ─────────────────────────── timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") run_name = f"mappo_{STATE_TO_RUN}_{num_agents}agents_{num_episodes}eps_{timestamp}" root_dir = os.path.join("Testing_with_australia_data", run_name) os.makedirs(root_dir, exist_ok=True) print(f"Saving training outputs to: {root_dir}") logs_dir = os.path.join(root_dir, "logs") plots_dir = os.path.join(root_dir, "plots") os.makedirs(logs_dir, exist_ok=True) os.makedirs(plots_dir, exist_ok=True) # Create the MeanField agent mappo = MAPPO( n_agents=num_agents, local_dim=local_state_dim, global_dim=num_agents * local_state_dim, act_dim=action_dim, lr=2e-4, gamma=0.95, lam=0.95, clip_eps=0.2, k_epochs=4, batch_size=batch_size ) # ─────────────── Tracking / Logging Variables ─────────────── episode_rewards = [] # mean reward per episode (averaged across agents) episode_total_rewards = [] # total reward per episode (sum across agents) block_mean_rewards = [] # mean of mean-episode-rewards for each block of window_size block_total_rewards = [] # sum of total-episode-rewards for each block of window_size agent_rewards_log = [[] for _ in range(num_agents)] best_mean_reward = -1e9 best_model_path = os.path.join(logs_dir, "best_model.pth") daily_rewards = [] # alias for episode_rewards monthly_rewards = [] # just kept in case you want the old logic training_start_time = time.time() episode_durations = [] total_steps_global = 0 episode_log_data = [] # ADD THIS LINE to store the new metrics from the environment performance_metrics_log = [] # This will hold the detailed performance data for each episode. agent_charge_log = [[] for _ in range(num_agents)] # Track charge actions agent_discharge_log = [[] for _ in range(num_agents)] # Track discharge actions # ──────────── Training Loop ──────────── for episode in range(1, num_episodes + 1): episode_start_time = time.time() obs = np.array(env.reset(), dtype=np.float32) # ADD THIS BLOCK to collect metrics from the *previous* episode # ================================================================= # The env.reset() call above finalized the metrics for the episode that just finished. # We retrieve them here. We check `if episode > 1` because there are no # metrics to collect before the first episode has run. if episode > 1: # Call the getter method you added to the environment last_episode_metrics = env.get_episode_metrics() # Add the corresponding episode number for merging later last_episode_metrics['Episode'] = episode - 1 # Append the dictionary of metrics to our new log performance_metrics_log.append(last_episode_metrics) # ================================================================= total_reward = np.zeros(num_agents, dtype=np.float32) done = False step_count = 0 day_logs = [] episode_charges = [[] for _ in range(num_agents)] episode_discharges = [[] for _ in range(num_agents)] while not done: # flatten the joint state once per step # build global state and pick actions # obs is already a NumPy array of shape (num_agents, local_dim) global_obs = obs.flatten() actions, logps = mappo.select_action(obs, global_obs) # step environment next_obs_list, rewards, done, info = env.step(actions) # convert next observations to NumPy array too next_obs = np.array(next_obs_list, dtype=np.float32) next_global_obs = next_obs.flatten() # store transition # ensure fast conversion to torch.Tensor local_obs_arr = np.array(obs, dtype=np.float32) mappo.store( local_obs_arr, global_obs, actions, logps, rewards, done, next_global_obs ) total_reward += rewards obs = next_obs step_count += 1 total_steps_global += 1 day_logs.append({ "step": step_count - 1, "grid_import_no_p2p": info["grid_import_no_p2p"], "grid_import_with_p2p": info["grid_import_with_p2p"], "p2p_buy": info["p2p_buy"], "p2p_sell": info["p2p_sell"], "costs": info["costs"], # Capture costs for analysis "charge_amount": info.get("charge_amount", np.zeros(num_agents)), # New "discharge_amount": info.get("discharge_amount", np.zeros(num_agents)) # New }) if step_count >= max_steps: break # ─── After each episode ─── # 1) Compute per-episode metrics sum_ep_reward = float(np.sum(total_reward)) # total reward across all agents for this episode mean_ep_reward = float(np.mean(total_reward)) # mean reward across agents for this episode episode_total_rewards.append(sum_ep_reward) episode_rewards.append(mean_ep_reward) daily_rewards.append(mean_ep_reward) # 2) If we just finished a block of window_size episodes, aggregate if len(daily_rewards) % window_size == 0: # Sum of total rewards over the last window_size episodes last_totals = episode_total_rewards[-window_size:] block_sum = sum(last_totals) block_total_rewards.append(block_sum) # Mean of mean-episode-rewards over the last window_size episodes last_means = daily_rewards[-window_size:] block_mean = sum(last_means) / window_size block_mean_rewards.append(block_mean) block_idx = len(block_mean_rewards) print( f"→ Completed Block {block_idx} " f"| Episodes { (block_idx-1)*window_size + 1 }–{ block_idx*window_size } " f"| Block Total Reward: {block_sum:.3f} " f"| Block Mean Reward: {block_mean:.3f}" ) # 3) Log agent-level rewards for i in range(num_agents): agent_rewards_log[i].append(total_reward[i]) episode_charges[i].append(actions[i][4]) episode_discharges[i].append(actions[i][5]) # 4) Summarize P2P steps (unchanged from your original code) steps_data = [] for entry in day_logs: step_idx = entry["step"] p2p_buy_array = entry["p2p_buy"] p2p_sell_array = entry["p2p_sell"] grid_no_p2p_array = entry["grid_import_no_p2p"] grid_with_p2p_array = entry["grid_import_with_p2p"] steps_data.append({ "step": step_idx, "p2p_buy_sum": float(np.sum(p2p_buy_array)), "p2p_sell_sum": float(np.sum(p2p_sell_array)), "grid_import_no_p2p_sum": float(np.sum(grid_no_p2p_array)), "grid_import_with_p2p_sum": float(np.sum(grid_with_p2p_array)) }) baseline_cost = np.sum([np.sum(entry["grid_import_no_p2p"]) * env.get_grid_price(entry["step"]) for entry in day_logs]) actual_cost = np.sum([np.sum(entry["costs"]) for entry in day_logs]) cost_reduction = (baseline_cost - actual_cost) / baseline_cost # at end of episode mappo.update() # Update the MAPPO agent # save if best if mean_ep_reward > best_mean_reward: best_mean_reward = mean_ep_reward mappo.save(best_model_path) if episode % checkpoint_interval == 0: ckpt_path = os.path.join(logs_dir, f"checkpoint_{episode}.pth") mappo.save(ckpt_path) # CORRECTED TIMING AND LOGGING episode_end_time = time.time() episode_duration = episode_end_time - episode_start_time # Move the print statement here print( f"Episode {episode}/{num_episodes} " f"| Time per Episode: {episode_duration:.2f}s " f"| Steps: {step_count} " f"| Mean Reward: {mean_ep_reward:.3f} " f"| Cost Reduction: {cost_reduction:.2%}" ) # Record data in our per-episode log episode_log_data.append({ "Episode": episode, "Steps": step_count, "Mean_Reward": mean_ep_reward, "Total_Reward": sum_ep_reward, "Cost_Reduction_Pct": cost_reduction * 100, # New "Baseline_Cost": baseline_cost, # New "Actual_Cost": actual_cost, # New "Episode_Duration": episode_duration, "Total_Charge": np.sum([np.sum(entry["charge_amount"]) for entry in day_logs]), # New "Total_Discharge": np.sum([np.sum(entry["discharge_amount"]) for entry in day_logs]) # New }) for i in range(num_agents): agent_charge_log[i].append(np.mean(episode_charges[i])) agent_discharge_log[i].append(np.mean(episode_discharges[i])) # ADD THIS BLOCK TO CAPTURE THE FINAL EPISODE'S METRICS # ================================================================= # After the loop, the metrics for the final episode (num_episodes) are ready. # We collect them here to ensure the log is complete. final_episode_metrics = env.get_episode_metrics() final_episode_metrics['Episode'] = num_episodes performance_metrics_log.append(final_episode_metrics) # ================================================================= # ─── End of all training ─── training_end_time = time.time() total_training_time = training_end_time - training_start_time # Save out per-episode agent rewards + mean rewards np.save(os.path.join(logs_dir, "agent_rewards.npy"), np.array(agent_rewards_log)) np.save(os.path.join(logs_dir, "mean_rewards.npy"), np.array(episode_rewards)) np.save(os.path.join(logs_dir, "total_rewards.npy"), np.array(episode_total_rewards)) ################################# PLOTTING & LOGGING ################################################################## # ─────────── Create Final DataFrame for Logging and Plotting ─────────── # 1. Create a DataFrame from the original log data (rewards, costs, etc.) df_rewards_log = pd.DataFrame(episode_log_data) # 2. Create a DataFrame from the new performance metrics log df_perf_log = pd.DataFrame(performance_metrics_log) # 3. Merge the two DataFrames on the 'Episode' column. # This combines all metrics into a single table. df_final_log = pd.merge(df_rewards_log, df_perf_log.drop(columns=[ 'degradation_cost_over_time', 'cost_savings_over_time', 'grid_reduction_over_time' ]), on="Episode") # ─────────── PLOTTING ─────────── # Ensure plot directory exists os.makedirs(plots_dir, exist_ok=True) # Helper: centered moving average def moving_avg(series, window): return pd.Series(series).rolling(window=window, center=True, min_periods=1).mean().to_numpy() # Smoothing window (in episodes) ma_window = 300 episodes = np.arange(1, num_episodes + 1) # 1. Mean Reward moving average reward_ma = moving_avg(df_final_log["Mean_Reward"], ma_window) plt.figure(figsize=(8,5)) plt.plot(episodes, reward_ma, linewidth=2, label=f"Mean Reward MA (win={ma_window})") plt.xlabel("Episode") plt.ylabel("Mean Reward") plt.title("MAPPO: Mean Reward Moving Average") plt.legend() plt.grid(True) plt.savefig(os.path.join(plots_dir, "mean_reward_ma.png"), dpi=200) plt.close() # 2. Total Reward moving average total_ma = moving_avg(df_final_log["Total_Reward"], ma_window) plt.figure(figsize=(8,5)) plt.plot(episodes, total_ma, linewidth=2, label=f"Total Reward MA (win={ma_window})") plt.xlabel("Episode") plt.ylabel("Total Reward") plt.title("MAPPO: Total Reward Moving Average") plt.legend() plt.grid(True) plt.savefig(os.path.join(plots_dir, "total_reward_ma.png"), dpi=200) plt.close() # 3. Cost Reduction (%) moving average cost_ma = moving_avg(df_final_log["Cost_Reduction_Pct"], ma_window) plt.figure(figsize=(8,5)) plt.plot(episodes, cost_ma, linewidth=2, label="Cost Reduction MA (%)") plt.xlabel("Episode") plt.ylabel("Cost Reduction (%)") plt.title("MAPPO: Cost Reduction Moving Average") plt.legend() plt.grid(True) plt.savefig(os.path.join(plots_dir, "cost_reduction_ma.png"), dpi=200) plt.close() # 4. Battery Degradation Cost moving average degradation_ma = moving_avg(df_final_log["battery_degradation_cost_total"], ma_window) plt.figure(figsize=(8,5)) plt.plot(episodes, degradation_ma, linewidth=2, label=f"Degradation Cost MA (win={ma_window})", color='purple') plt.xlabel("Episode") plt.ylabel("Total Degradation Cost ($)") plt.title("MAPPO: Battery Degradation Cost Moving Average") plt.legend() plt.grid(True) plt.savefig(os.path.join(plots_dir, "degradation_cost_ma.png"), dpi=200) plt.close() # Final confirmation message print(f"\nAll moving-average plots saved to: {plots_dir}") # ─── Save Final Logs to CSV ─── # 1. Add the total training time as a new row to the DataFrame total_time_row = pd.DataFrame([{ "Episode": "Total_Training_Time", "Episode_Duration": total_training_time }]) df_to_save = pd.concat([df_final_log, total_time_row], ignore_index=True) # 2. Define the path for the final CSV file. log_csv_path = os.path.join(logs_dir, "training_performance_log.csv") # 3. Select and reorder columns for the final CSV columns_to_save = [ "Episode", "Mean_Reward", "Total_Reward", "Cost_Reduction_Pct", "Episode_Duration", "battery_degradation_cost_total", ] df_to_save = df_to_save[columns_to_save] # 4. Save the comprehensive DataFrame to CSV. df_to_save.to_csv(log_csv_path, index=False) print(f"Saved comprehensive training performance log to: {log_csv_path}") # ─── Final Timings Printout ─── print("\n" + "="*50) print("TRAINING COMPLETE".center(50)) print(f"Total training time: {total_training_time:.2f} seconds") print("="*50) if __name__ == "__main__": main()