import os
import sys
import time
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Make the repository root importable so the environment and trainer resolve.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from solar_sharer_battery_env import SolarSharer
from mappo.trainer.mappo import MAPPO
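

# Training driver: rolls out MAPPO on the SolarSharer peer-to-peer energy
# trading environment, logging per-episode rewards, costs, and battery
# metrics, and saving models, CSV logs, and moving-average plots.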
def main():
    # ----- Run configuration -----
    STATE_TO_RUN = "pennsylvania"

    DATA_FILE_PATH = "/Users/ananygupta/Desktop/Final_revision/Australia_data/processed_data_ausgrid_100_houses.csv"
    num_episodes = 10000
    batch_size = 256
    checkpoint_interval = 100000  # NOTE: exceeds num_episodes, so the periodic checkpoint below never fires
    window_size = 32              # episodes per reporting block

    # ----- Environment -----
    env = SolarSharer(
        data_path=DATA_FILE_PATH,
        state=STATE_TO_RUN,
        time_freq="30T",
    )

    print("Observation space:", env.observation_space)
    print("Action space     :", env.action_space)

    # Sanity check: one reset and one random step before training starts.
    obs = env.reset()
    print(f"Reset returned {len(obs)} agent observations; each obs shape: {np.array(obs).shape}")

    dummy_actions = np.random.rand(env.num_agents, env.action_space.shape[1]).astype(np.float32)
    next_obs, rewards, done, info = env.step(dummy_actions)
    print(f"Step outputs → next_obs: {len(next_obs)}×{np.array(next_obs).shape[1]}, "
          f"rewards: {len(rewards)}, done: {done}")
    print("Info keys:", list(info.keys()))

    # Count the houses assigned to each of the two agent groups.
    env.group_counts = {
        0: env.agent_groups.count(0),
        1: env.agent_groups.count(1),
    }
    print(f"Number of houses in each group: {env.group_counts}")

    max_steps = env.num_steps

    num_agents = env.num_agents
    local_state_dim = env.observation_space.shape[1]
    action_dim = env.action_space.shape[1]

    # ----- Output directories -----
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_name = f"mappo_{STATE_TO_RUN}_{num_agents}agents_{num_episodes}eps_{timestamp}"
    root_dir = os.path.join("Testing_with_australia_data", run_name)
    os.makedirs(root_dir, exist_ok=True)
    print(f"Saving training outputs to: {root_dir}")

    logs_dir = os.path.join(root_dir, "logs")
    plots_dir = os.path.join(root_dir, "plots")
    os.makedirs(logs_dir, exist_ok=True)
    os.makedirs(plots_dir, exist_ok=True)

    # ----- Trainer -----
    # The centralized critic consumes the concatenation of all local
    # observations, hence global_dim = num_agents * local_state_dim.
    mappo = MAPPO(
        n_agents=num_agents,
        local_dim=local_state_dim,
        global_dim=num_agents * local_state_dim,
        act_dim=action_dim,
        lr=2e-4,
        gamma=0.95,
        lam=0.95,
        clip_eps=0.2,
        k_epochs=4,
        batch_size=batch_size,
    )
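    # gamma and lam parameterize GAE; clip_eps is the PPO clipped-surrogate
    # threshold and k_epochs the number of optimization passes per update.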

    # ----- Bookkeeping -----
    episode_rewards = []        # mean reward per episode (across agents)
    episode_total_rewards = []  # summed reward per episode
    block_mean_rewards = []
    block_total_rewards = []

    agent_rewards_log = [[] for _ in range(num_agents)]
    best_mean_reward = -np.inf
    best_model_path = os.path.join(logs_dir, "best_model.pth")

    daily_rewards = []  # duplicates episode_rewards; drives the block reporting below

    training_start_time = time.time()
    total_steps_global = 0
    episode_log_data = []
    performance_metrics_log = []

    # Per-agent mean charge/discharge action in each episode.
    agent_charge_log = [[] for _ in range(num_agents)]
    agent_discharge_log = [[] for _ in range(num_agents)]
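
    # ----- Training loop -----
    # Each episode rolls out up to max_steps steps, storing every transition,
    # then finishes with a single PPO update on that batch.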
    for episode in range(1, num_episodes + 1):
        episode_start_time = time.time()

        obs = np.array(env.reset(), dtype=np.float32)

        # Harvest the finished episode's metrics after the reset.
        # NOTE: this assumes env.reset() does not clear the metrics of the
        # episode that just ended.
        if episode > 1:
            last_episode_metrics = env.get_episode_metrics()
            last_episode_metrics['Episode'] = episode - 1
            performance_metrics_log.append(last_episode_metrics)

        total_reward = np.zeros(num_agents, dtype=np.float32)
        done = False
        step_count = 0
        day_logs = []
        episode_charges = [[] for _ in range(num_agents)]
        episode_discharges = [[] for _ in range(num_agents)]

        while not done:
            # Centralized training, decentralized execution: actors act on
            # local observations while the critic sees the flattened global state.
            global_obs = obs.flatten()
            actions, logps = mappo.select_action(obs, global_obs)

            next_obs_list, rewards, done, info = env.step(actions)

            next_obs = np.array(next_obs_list, dtype=np.float32)
            next_global_obs = next_obs.flatten()

            local_obs_arr = np.array(obs, dtype=np.float32)
            mappo.store(
                local_obs_arr,
                global_obs,
                actions,
                logps,
                rewards,
                done,
                next_global_obs
            )
            total_reward += rewards
            obs = next_obs
            step_count += 1
            total_steps_global += 1

            # Track per-step battery actions; indices 4 and 5 are assumed to
            # be the charge and discharge components of each agent's action.
            for i in range(num_agents):
                episode_charges[i].append(actions[i][4])
                episode_discharges[i].append(actions[i][5])

            # Per-step environment diagnostics for end-of-episode cost accounting.
            day_logs.append({
                "step": step_count - 1,
                "grid_import_no_p2p": info["grid_import_no_p2p"],
                "grid_import_with_p2p": info["grid_import_with_p2p"],
                "p2p_buy": info["p2p_buy"],
                "p2p_sell": info["p2p_sell"],
                "costs": info["costs"],
                "charge_amount": info.get("charge_amount", np.zeros(num_agents)),
                "discharge_amount": info.get("discharge_amount", np.zeros(num_agents))
            })

            if step_count >= max_steps:
                break
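
        # The rollout ends when the env signals done or the step cap is hit.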
        sum_ep_reward = float(np.sum(total_reward))
        mean_ep_reward = float(np.mean(total_reward))

        episode_total_rewards.append(sum_ep_reward)
        episode_rewards.append(mean_ep_reward)
        daily_rewards.append(mean_ep_reward)

        # Report aggregate statistics every window_size episodes.
        if len(daily_rewards) % window_size == 0:
            last_totals = episode_total_rewards[-window_size:]
            block_sum = sum(last_totals)
            block_total_rewards.append(block_sum)

            last_means = daily_rewards[-window_size:]
            block_mean = sum(last_means) / window_size
            block_mean_rewards.append(block_mean)

            block_idx = len(block_mean_rewards)
            print(
                f"✓ Completed Block {block_idx} "
                f"| Episodes {(block_idx - 1) * window_size + 1}–{block_idx * window_size} "
                f"| Block Total Reward: {block_sum:.3f} "
                f"| Block Mean Reward: {block_mean:.3f}"
            )

        for i in range(num_agents):
            agent_rewards_log[i].append(total_reward[i])

        # Aggregate per-step P2P and grid quantities (kept for optional
        # per-step analysis; not written to disk below).
        steps_data = []
        for entry in day_logs:
            steps_data.append({
                "step": entry["step"],
                "p2p_buy_sum": float(np.sum(entry["p2p_buy"])),
                "p2p_sell_sum": float(np.sum(entry["p2p_sell"])),
                "grid_import_no_p2p_sum": float(np.sum(entry["grid_import_no_p2p"])),
                "grid_import_with_p2p_sum": float(np.sum(entry["grid_import_with_p2p"]))
            })

        # Baseline: what the day would have cost without P2P trading, pricing
        # all no-P2P grid imports at the time-varying grid tariff.
        baseline_cost = np.sum([np.sum(entry["grid_import_no_p2p"]) * env.get_grid_price(entry["step"])
                                for entry in day_logs])
        actual_cost = np.sum([np.sum(entry["costs"]) for entry in day_logs])
        cost_reduction = (baseline_cost - actual_cost) / baseline_cost if baseline_cost > 0 else 0.0
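
        # cost_reduction is a fraction (0.25 = 25% cheaper than the no-P2P
        # baseline); it is reported as a percentage in the logs below.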

        # One PPO update per episode on the freshly collected on-policy batch.
        mappo.update()

        # Keep the best model (by per-episode mean reward) and periodic checkpoints.
        if mean_ep_reward > best_mean_reward:
            best_mean_reward = mean_ep_reward
            mappo.save(best_model_path)

        if episode % checkpoint_interval == 0:
            ckpt_path = os.path.join(logs_dir, f"checkpoint_{episode}.pth")
            mappo.save(ckpt_path)

        episode_duration = time.time() - episode_start_time

        print(
            f"Episode {episode}/{num_episodes} "
            f"| Time per Episode: {episode_duration:.2f}s "
            f"| Steps: {step_count} "
            f"| Mean Reward: {mean_ep_reward:.3f} "
            f"| Cost Reduction: {cost_reduction:.2%}"
        )

        episode_log_data.append({
            "Episode": episode,
            "Steps": step_count,
            "Mean_Reward": mean_ep_reward,
            "Total_Reward": sum_ep_reward,
            "Cost_Reduction_Pct": cost_reduction * 100,
            "Baseline_Cost": baseline_cost,
            "Actual_Cost": actual_cost,
            "Episode_Duration": episode_duration,
            "Total_Charge": np.sum([np.sum(entry["charge_amount"]) for entry in day_logs]),
            "Total_Discharge": np.sum([np.sum(entry["discharge_amount"]) for entry in day_logs])
        })

        for i in range(num_agents):
            agent_charge_log[i].append(np.mean(episode_charges[i]))
            agent_discharge_log[i].append(np.mean(episode_discharges[i]))

    # Collect metrics for the final episode (the loop only harvests them on
    # the next reset, so the last one is grabbed here).
    final_episode_metrics = env.get_episode_metrics()
    final_episode_metrics['Episode'] = num_episodes
    performance_metrics_log.append(final_episode_metrics)
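
    # performance_metrics_log now holds exactly one row per episode
    # (Episodes 1 through num_episodes), matching episode_log_data.
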
    training_end_time = time.time()
    total_training_time = training_end_time - training_start_time

    # Raw reward arrays for offline analysis.
    np.save(os.path.join(logs_dir, "agent_rewards.npy"), np.array(agent_rewards_log))
    np.save(os.path.join(logs_dir, "mean_rewards.npy"), np.array(episode_rewards))
    np.save(os.path.join(logs_dir, "total_rewards.npy"), np.array(episode_total_rewards))

    # ----- Consolidated episode log -----
    df_rewards_log = pd.DataFrame(episode_log_data)
    df_perf_log = pd.DataFrame(performance_metrics_log)

    # Drop the per-step time-series columns before merging; errors="ignore"
    # keeps this robust if the env omits any of them.
    df_final_log = pd.merge(df_rewards_log, df_perf_log.drop(columns=[
        'degradation_cost_over_time',
        'cost_savings_over_time',
        'grid_reduction_over_time'
    ], errors="ignore"), on="Episode")

    # ----- Plots -----
    os.makedirs(plots_dir, exist_ok=True)

    def moving_avg(series, window):
        return pd.Series(series).rolling(window=window, center=True, min_periods=1).mean().to_numpy()

    ma_window = 300
    episodes = np.arange(1, num_episodes + 1)
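
    # moving_avg uses a centered rolling mean; min_periods=1 lets the first
    # and last windows shrink instead of producing NaNs at the edges.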

    reward_ma = moving_avg(df_final_log["Mean_Reward"], ma_window)
    plt.figure(figsize=(8, 5))
    plt.plot(episodes, reward_ma, linewidth=2, label=f"Mean Reward MA (win={ma_window})")
    plt.xlabel("Episode")
    plt.ylabel("Mean Reward")
    plt.title("MAPPO: Mean Reward Moving Average")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(plots_dir, "mean_reward_ma.png"), dpi=200)
    plt.close()

    total_ma = moving_avg(df_final_log["Total_Reward"], ma_window)
    plt.figure(figsize=(8, 5))
    plt.plot(episodes, total_ma, linewidth=2, label=f"Total Reward MA (win={ma_window})")
    plt.xlabel("Episode")
    plt.ylabel("Total Reward")
    plt.title("MAPPO: Total Reward Moving Average")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(plots_dir, "total_reward_ma.png"), dpi=200)
    plt.close()

    cost_ma = moving_avg(df_final_log["Cost_Reduction_Pct"], ma_window)
    plt.figure(figsize=(8, 5))
    plt.plot(episodes, cost_ma, linewidth=2, label="Cost Reduction MA (%)")
    plt.xlabel("Episode")
    plt.ylabel("Cost Reduction (%)")
    plt.title("MAPPO: Cost Reduction Moving Average")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(plots_dir, "cost_reduction_ma.png"), dpi=200)
    plt.close()

    degradation_ma = moving_avg(df_final_log["battery_degradation_cost_total"], ma_window)
    plt.figure(figsize=(8, 5))
    plt.plot(episodes, degradation_ma, linewidth=2, label=f"Degradation Cost MA (win={ma_window})", color='purple')
    plt.xlabel("Episode")
    plt.ylabel("Total Degradation Cost ($)")
    plt.title("MAPPO: Battery Degradation Cost Moving Average")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(plots_dir, "degradation_cost_ma.png"), dpi=200)
    plt.close()

    print(f"\nAll moving-average plots saved to: {plots_dir}")

    # ----- CSV export -----
    # Append a summary row carrying the total wall-clock training time.
    total_time_row = pd.DataFrame([{
        "Episode": "Total_Training_Time",
        "Episode_Duration": total_training_time
    }])
    df_to_save = pd.concat([df_final_log, total_time_row], ignore_index=True)
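
    # The summary row only defines Episode and Episode_Duration; pandas fills
    # its remaining columns with NaN, which is harmless in the CSV.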

    log_csv_path = os.path.join(logs_dir, "training_performance_log.csv")

    columns_to_save = [
        "Episode",
        "Mean_Reward",
        "Total_Reward",
        "Cost_Reduction_Pct",
        "Episode_Duration",
        "battery_degradation_cost_total",
    ]
    df_to_save = df_to_save[columns_to_save]

    df_to_save.to_csv(log_csv_path, index=False)
    print(f"Saved training performance log to: {log_csv_path}")

    print("\n" + "=" * 50)
    print("TRAINING COMPLETE".center(50))
    print(f"Total training time: {total_training_time:.2f} seconds")
    print("=" * 50)


if __name__ == "__main__":
    main()