import os
import sys
import re
import numpy as np
import torch
import matplotlib.pyplot as plt
import pandas as pd
import time
from datetime import datetime

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from solar_sys_environment import SolarSys
from maddpg.trainer.maddpg import MADDPG


def main():
    STATE_TO_RUN = "oklahoma"  # "pennsylvania" or "colorado" or "oklahoma"

    # Set the path to your training data
    DATA_FILE_PATH = "/path/to/project/training/5houses_152days_TRAIN.csv"

    num_episodes = 10000
    batch_size = 256
    checkpoint_interval = 100000
    window_size = 32

    env = SolarSys(
        data_path=DATA_FILE_PATH,
        state=STATE_TO_RUN,
        time_freq="15T"
    )

    # Sanity check: env I/O shapes
    print("Observation space:", env.observation_space)
    print("Action space     :", env.action_space)

    # Reset and inspect obs
    obs = env.reset()
    print(f"Reset returned {len(obs)} agent observations; stacked obs shape: {np.array(obs).shape}")

    # Sample random actions and do one step
    dummy_actions = np.random.rand(env.num_agents, env.action_space.shape[1]).astype(np.float32)
    next_obs, rewards, done, info = env.step(dummy_actions)
    print(f"Step outputs → next_obs: {len(next_obs)}×{np.array(next_obs).shape[1]}, "
          f"rewards: {len(rewards)}, done: {done}")
    print("Info keys:", list(info.keys()))

    # Count the number of houses in each group
    env.group_counts = {
        0: env.agent_groups.count(0),
        1: env.agent_groups.count(1)
    }
    print(f"Number of houses in each group: {env.group_counts}")

    max_steps = env.num_steps

    # Dims from the env
    num_agents = env.num_agents
    local_state_dim = env.observation_space.shape[1]
    action_dim = env.action_space.shape[1]

    # Build a unique run directory
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_name = f"maddpg_para_sharing_{STATE_TO_RUN}_{num_agents}agents_{num_episodes}eps_{timestamp}"
    root_dir = os.path.join("FINALE_FINALE_FINALE", run_name)
    os.makedirs(root_dir, exist_ok=True)
    print(f"Saving training outputs to: {root_dir}")

    logs_dir = os.path.join(root_dir, "logs")
    plots_dir = os.path.join(root_dir, "plots")
    os.makedirs(logs_dir, exist_ok=True)
    os.makedirs(plots_dir, exist_ok=True)

    # Create the MADDPG agent
    maddpg = MADDPG(
        num_agents=num_agents,
        state_dim=local_state_dim,
        action_dim=action_dim,
        gamma=0.95,
        tau=0.01,
        lr_actor=1e-4,
        lr_critic=1e-3,
        buffer_size=1000000,
        noise_episodes=5000,
        init_sigma=0.3,
        final_sigma=0.01,
        batch_size=batch_size
    )

    # Tracking / Logging Variables
    episode_rewards = []
    episode_total_rewards = []
    block_mean_rewards = []
    block_total_rewards = []
    agent_rewards_log = [[] for _ in range(num_agents)]
    best_mean_reward = -1e9
    best_model_path = os.path.join(logs_dir, "best_model.pth")
    daily_rewards = []
    monthly_rewards = []
    training_start_time = time.time()
    episode_durations = []
    total_steps_global = 0
    episode_log_data = []
    performance_metrics_log = []
    agent_charge_log = [[] for _ in range(num_agents)]
    agent_discharge_log = [[] for _ in range(num_agents)]

    # Training Loop
    for episode in range(1, num_episodes + 1):
        episode_start_time = time.time()
        obs = np.array(env.reset(), dtype=np.float32)

        # Collect metrics from the previous episode
        if episode > 1:
            last_episode_metrics = env.get_episode_metrics()
            last_episode_metrics['Episode'] = episode - 1
            performance_metrics_log.append(last_episode_metrics)

        total_reward = np.zeros(num_agents, dtype=np.float32)
        done = False
        step_count = 0
        day_logs = []
        episode_charges = [[] for _ in range(num_agents)]
        episode_discharges = [[] for _ in range(num_agents)]

        while not done:
            # Select actions using the MADDPG agent
            actions = maddpg.select_actions(obs)
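            # NOTE: assumption about the MADDPG trainer's API (it is not defined in
            # this file): select_actions() is expected to run each agent's actor on
            # its local observation and add exploration noise, with sigma annealed
            # from init_sigma to final_sigma over noise_episodes via the
            # on_episode_end() call at the end of each episode below.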

            # Step environment
            next_obs_list, rewards, done, info = env.step(actions)
            next_obs = np.array(next_obs_list, dtype=np.float32)

            # Store the transition in the replay buffer
            maddpg.store_transition(obs, actions, rewards, next_obs, done)

            # Train the agent at every step
            maddpg.train()

            total_reward += rewards
            obs = next_obs
            step_count += 1
            total_steps_global += 1

            for i in range(num_agents):
                episode_charges[i].append(info["charge_amount"][i])
                episode_discharges[i].append(info["discharge_amount"][i])

            day_logs.append({
                "step": step_count - 1,
                "grid_import_no_p2p": info["grid_import_no_p2p"],
                "grid_import_with_p2p": info["grid_import_with_p2p"],
                "p2p_buy": info["p2p_buy"],
                "p2p_sell": info["p2p_sell"],
                "costs": info["costs"],
                "charge_amount": info.get("charge_amount", np.zeros(num_agents)),
                "discharge_amount": info.get("discharge_amount", np.zeros(num_agents))
            })

            if step_count >= max_steps:
                break

        # After each episode
        # Compute per-episode metrics
        sum_ep_reward = float(np.sum(total_reward))
        mean_ep_reward = float(np.mean(total_reward))
        episode_total_rewards.append(sum_ep_reward)
        episode_rewards.append(mean_ep_reward)
        daily_rewards.append(mean_ep_reward)

        # If we just finished a block of window_size episodes, aggregate
        if len(daily_rewards) % window_size == 0:
            last_totals = episode_total_rewards[-window_size:]
            block_sum = sum(last_totals)
            block_total_rewards.append(block_sum)

            last_means = daily_rewards[-window_size:]
            block_mean = sum(last_means) / window_size
            block_mean_rewards.append(block_mean)

            block_idx = len(block_mean_rewards)
            print(
                f"→ Completed Block {block_idx} "
                f"| Episodes {(block_idx - 1) * window_size + 1}–{block_idx * window_size} "
                f"| Block Total Reward: {block_sum:.3f} "
                f"| Block Mean Reward: {block_mean:.3f}"
            )

        # Log agent-level rewards
        for i in range(num_agents):
            agent_rewards_log[i].append(total_reward[i])
            agent_charge_log[i].append(np.mean(episode_charges[i]))
            agent_discharge_log[i].append(np.mean(episode_discharges[i]))

        # Summarize P2P steps
        steps_data = []
        for entry in day_logs:
            step_idx = entry["step"]
            p2p_buy_array = entry["p2p_buy"]
            p2p_sell_array = entry["p2p_sell"]
            grid_no_p2p_array = entry["grid_import_no_p2p"]
            grid_with_p2p_array = entry["grid_import_with_p2p"]
            steps_data.append({
                "step": step_idx,
                "p2p_buy_sum": float(np.sum(p2p_buy_array)),
                "p2p_sell_sum": float(np.sum(p2p_sell_array)),
                "grid_import_no_p2p_sum": float(np.sum(grid_no_p2p_array)),
                "grid_import_with_p2p_sum": float(np.sum(grid_with_p2p_array))
            })

        baseline_cost = np.sum([np.sum(entry["grid_import_no_p2p"]) * env.get_grid_price(entry["step"])
                                for entry in day_logs])
        actual_cost = np.sum([np.sum(entry["costs"]) for entry in day_logs])
        cost_reduction = (baseline_cost - actual_cost) / baseline_cost

        # Call on_episode_end() for noise decay schedule
        maddpg.on_episode_end()

        # Save if best
        if mean_ep_reward > best_mean_reward:
            best_mean_reward = mean_ep_reward
            maddpg.save(best_model_path)

        if episode % checkpoint_interval == 0:
            ckpt_path = os.path.join(logs_dir, f"checkpoint_{episode}.pth")
            maddpg.save(ckpt_path)

        episode_end_time = time.time()
        episode_duration = episode_end_time - episode_start_time

        print(
            f"Episode {episode}/{num_episodes} "
            f"| Time per Episode: {episode_duration:.2f}s "
            f"| Steps: {step_count} "
            f"| Mean Reward: {mean_ep_reward:.3f} "
            f"| Cost Reduction: {cost_reduction:.2%}"
        )

        # Record data in per-episode log
        episode_log_data.append({
            "Episode": episode,
            "Steps": step_count,
            "Mean_Reward": mean_ep_reward,
            "Total_Reward": sum_ep_reward,
            "Cost_Reduction_Pct": cost_reduction * 100,
            "Baseline_Cost": baseline_cost,
            "Actual_Cost": actual_cost,
            "Episode_Duration": episode_duration,
            "Total_Charge": np.sum([np.sum(entry["charge_amount"]) for entry in day_logs]),
            "Total_Discharge": np.sum([np.sum(entry["discharge_amount"]) for entry in day_logs])
        })

    # Capture the final episode's metrics
    final_episode_metrics = env.get_episode_metrics()
    final_episode_metrics['Episode'] = num_episodes
    performance_metrics_log.append(final_episode_metrics)

    # End of all training
    training_end_time = time.time()
    total_training_time = training_end_time - training_start_time

    # Save out per-episode agent rewards + mean rewards
    np.save(os.path.join(logs_dir, "agent_rewards.npy"), np.array(agent_rewards_log))
    np.save(os.path.join(logs_dir, "mean_rewards.npy"), np.array(episode_rewards))
    np.save(os.path.join(logs_dir, "total_rewards.npy"), np.array(episode_total_rewards))

    # Create Final DataFrame for Logging and Plotting
    df_rewards_log = pd.DataFrame(episode_log_data)
    df_perf_log = pd.DataFrame(performance_metrics_log)

    # Merge the two DataFrames on the 'Episode' column
    df_final_log = pd.merge(df_rewards_log, df_perf_log.drop(columns=[
        'degradation_cost_over_time',
        'cost_savings_over_time',
        'grid_reduction_over_time'
    ]), on="Episode")

    # PLOTTING
    os.makedirs(plots_dir, exist_ok=True)

    # Helper: centered moving average
    def moving_avg(series, window):
        return pd.Series(series).rolling(window=window, center=True, min_periods=1).mean().to_numpy()

    # Smoothing window (in episodes)
    ma_window = 300
    episodes = np.arange(1, num_episodes + 1)

    # Mean Reward moving average
    reward_ma = moving_avg(df_final_log["Mean_Reward"], ma_window)
    plt.figure(figsize=(8, 5))
    plt.plot(episodes, reward_ma, linewidth=2, label=f"Mean Reward MA (win={ma_window})")
    plt.xlabel("Episode")
    plt.ylabel("Mean Reward")
    plt.title("MADDPG: Mean Reward Moving Average")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(plots_dir, "mean_reward_ma.png"), dpi=200)
    plt.close()

    # Total Reward moving average
    total_ma = moving_avg(df_final_log["Total_Reward"], ma_window)
    plt.figure(figsize=(8, 5))
    plt.plot(episodes, total_ma, linewidth=2, label=f"Total Reward MA (win={ma_window})")
    plt.xlabel("Episode")
    plt.ylabel("Total Reward")
    plt.title("MADDPG: Total Reward Moving Average")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(plots_dir, "total_reward_ma.png"), dpi=200)
    plt.close()

    # Cost Reduction (%) moving average
    cost_ma = moving_avg(df_final_log["Cost_Reduction_Pct"], ma_window)
    plt.figure(figsize=(8, 5))
    plt.plot(episodes, cost_ma, linewidth=2, label="Cost Reduction MA (%)")
    plt.xlabel("Episode")
    plt.ylabel("Cost Reduction (%)")
    plt.title("MADDPG: Cost Reduction Moving Average")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(plots_dir, "cost_reduction_ma.png"), dpi=200)
    plt.close()

    # Battery Degradation Cost moving average
    degradation_ma = moving_avg(df_final_log["battery_degradation_cost_total"], ma_window)
    plt.figure(figsize=(8, 5))
    plt.plot(episodes, degradation_ma, linewidth=2,
             label=f"Degradation Cost MA (win={ma_window})", color='purple')
    plt.xlabel("Episode")
    plt.ylabel("Total Degradation Cost ($)")
    plt.title("MADDPG: Battery Degradation Cost Moving Average")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(plots_dir, "degradation_cost_ma.png"), dpi=200)
    plt.close()

    print(f"\nAll moving-average plots saved to: {plots_dir}")

    # Save Final Logs to CSV
    total_time_row = pd.DataFrame([{
"Episode": "Total_Training_Time", "Episode_Duration": total_training_time }]) df_to_save = pd.concat([df_final_log, total_time_row], ignore_index=True) log_csv_path = os.path.join(logs_dir, "training_performance_log.csv") # Select and reorder columns for the final CSV columns_to_save = [ "Episode", "Mean_Reward", "Total_Reward", "Cost_Reduction_Pct", "Episode_Duration", "battery_degradation_cost_total", ] df_to_save = df_to_save[columns_to_save] df_to_save.to_csv(log_csv_path, index=False) print(f"Saved comprehensive training performance log to: {log_csv_path}") # Final Timings Printout print("\n" + "="*50) print("TRAINING COMPLETE".center(50)) print(f"Total training time: {total_training_time:.2f} seconds") print("="*50) if __name__ == "__main__": main()