import os
import sys
import re
import numpy as np
import torch
import matplotlib.pyplot as plt
import pandas as pd
import time
from datetime import datetime

# Make the parent directory importable so the project-local packages below resolve.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from solar_sys_environment import SolarSys
from PG.trainer.pg import PGAgent


def _moving_avg(series, window):
    """Return the centered rolling mean of *series* as a NumPy array.

    ``min_periods=1`` means the edges are averaged over whatever samples are
    available, so the output has the same length as the input.
    """
    return pd.Series(series).rolling(window=window, center=True, min_periods=1).mean().to_numpy()


def _save_moving_average_plot(x, values, label, ylabel, title, out_path, color=None):
    """Plot *values* against *x* as a single line and save the figure to *out_path*.

    Args:
        x: x-axis values (episode numbers); must have the same length as *values*.
        values: y-axis values (already smoothed).
        label: legend entry for the line.
        ylabel: y-axis label.
        title: figure title.
        out_path: destination image path.
        color: optional matplotlib color; the default color cycle is used when None.
    """
    line_style = {"linewidth": 2, "label": label}
    if color is not None:
        line_style["color"] = color

    plt.figure(figsize=(8, 5))
    plt.plot(x, values, **line_style)
    plt.xlabel("Episode")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.savefig(out_path, dpi=200)
    plt.close()


def main():
    """Train one independent PGAgent per house on the SolarSys environment.

    Runs ``num_episodes`` episodes, updates every agent after each episode,
    saves best/checkpoint/final model weights, reward arrays, moving-average
    plots and a CSV performance log under a timestamped run directory.
    """
    STATE_TO_RUN = "pennsylvania"  # "pennsylvania" or "colorado" or "oklahoma"

    # Set the path to your training data
    DATA_FILE_PATH = "/path/to/project/training/5houses_152days_TRAIN.csv"
    num_episodes = 10000
    # NOTE(review): larger than num_episodes, so periodic checkpoints are never
    # written with the current settings — confirm this is intended.
    checkpoint_interval = 100000
    window_size = 32

    env = SolarSys(
        data_path=DATA_FILE_PATH,
        state=STATE_TO_RUN,
        time_freq="15T"
    )

    # Sanity check: env I/O shapes
    print("Observation space:", env.observation_space)
    print("Action space :", env.action_space)

    # Reset and inspect obs
    obs = env.reset()
    print(f"Reset returned {len(obs)} agent observations; each obs shape: {np.array(obs).shape}")

    # Sample random actions and do one step
    dummy_actions = np.random.rand(env.num_agents, env.action_space.shape[1]).astype(np.float32)
    next_obs, rewards, done, info = env.step(dummy_actions)
    print(f"Step outputs → next_obs: {len(next_obs)}×{np.array(next_obs).shape[1]}, "
          f"rewards: {len(rewards)}, done: {done}")
    print("Info keys:", list(info.keys()))

    # Count the number of houses in each group
    env.group_counts = {
        0: env.agent_groups.count(0),
        1: env.agent_groups.count(1)
    }
    print(f"Number of houses in each group: {env.group_counts}")

    max_steps = env.num_steps

    # Dims from the env
    num_agents = env.num_agents
    local_state_dim = env.observation_space.shape[1]
    action_dim = env.action_space.shape[1]

    # Build a unique run directory
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_name = f"pg_{STATE_TO_RUN}_{num_agents}agents_{num_episodes}eps_{timestamp}"
    root_dir = os.path.join("FINALE_FINALE_FINALE", run_name)
    os.makedirs(root_dir, exist_ok=True)
    print(f"Saving training outputs to: {root_dir}")

    logs_dir = os.path.join(root_dir, "logs")
    plots_dir = os.path.join(root_dir, "plots")
    os.makedirs(logs_dir, exist_ok=True)
    os.makedirs(plots_dir, exist_ok=True)

    # Create one independent PG agent per house
    pg_agents = [
        PGAgent(
            state_dim=local_state_dim,
            action_dim=action_dim,
            lr=2e-4,
            gamma=0.95,
            critic_loss_coef=0.5
        )
        for _ in range(num_agents)
    ]

    # Tracking / Logging Variables
    episode_rewards = []
    episode_total_rewards = []
    block_mean_rewards = []
    block_total_rewards = []
    agent_rewards_log = [[] for _ in range(num_agents)]
    best_mean_reward = -1e9
    daily_rewards = []
    training_start_time = time.time()
    episode_log_data = []
    performance_metrics_log = []
    agent_charge_log = [[] for _ in range(num_agents)]
    agent_discharge_log = [[] for _ in range(num_agents)]

    # Training Loop
    for episode in range(1, num_episodes + 1):
        episode_start_time = time.time()
        obs = np.array(env.reset(), dtype=np.float32)

        # Metrics of the previous episode are collected right after reset().
        # NOTE(review): presumably the env finalises them inside reset() — confirm.
        if episode > 1:
            last_episode_metrics = env.get_episode_metrics()
            last_episode_metrics['Episode'] = episode - 1
            performance_metrics_log.append(last_episode_metrics)

        total_reward = np.zeros(num_agents, dtype=np.float32)
        done = False
        step_count = 0
        day_logs = []
        episode_charges = [[] for _ in range(num_agents)]
        episode_discharges = [[] for _ in range(num_agents)]

        # Main training loop for a single episode
        while not done:
            # Action Selection: Each PG agent acts independently
            actions = np.array(
                [agent.select_action(obs[i]) for i, agent in enumerate(pg_agents)],
                dtype=np.float32,
            )

            # Step the environment
            next_obs_list, rewards, done, info = env.step(actions)
            next_obs = np.array(next_obs_list, dtype=np.float32)

            # Store Rewards: Each agent stores its own reward
            for i, agent in enumerate(pg_agents):
                agent.rewards.append(rewards[i])
                agent.dones.append(done)

            total_reward += rewards
            obs = next_obs
            step_count += 1

            # Resolve the optional battery fields once so the step log and the
            # per-agent tracking below agree and neither raises KeyError when
            # the env omits them.
            charge_amount = info.get("charge_amount", np.zeros(num_agents))
            discharge_amount = info.get("discharge_amount", np.zeros(num_agents))

            day_logs.append({
                "step": step_count - 1,
                "grid_import_no_p2p": info["grid_import_no_p2p"],
                "grid_import_with_p2p": info["grid_import_with_p2p"],
                "p2p_buy": info["p2p_buy"],
                "p2p_sell": info["p2p_sell"],
                "costs": info["costs"],
                "charge_amount": charge_amount,
                "discharge_amount": discharge_amount
            })

            # Track actual charge/discharge actions from the environment
            for i in range(num_agents):
                episode_charges[i].append(charge_amount[i])
                episode_discharges[i].append(discharge_amount[i])

            # Hard cap on episode length in case the env never reports done.
            if step_count >= max_steps:
                break

        # After each episode
        sum_ep_reward = float(np.sum(total_reward))
        mean_ep_reward = float(np.mean(total_reward))
        episode_total_rewards.append(sum_ep_reward)
        episode_rewards.append(mean_ep_reward)
        daily_rewards.append(mean_ep_reward)

        # Every window_size episodes, summarise the block that just finished.
        if len(daily_rewards) % window_size == 0:
            last_totals = episode_total_rewards[-window_size:]
            block_sum = sum(last_totals)
            block_total_rewards.append(block_sum)

            last_means = daily_rewards[-window_size:]
            block_mean = sum(last_means) / window_size
            block_mean_rewards.append(block_mean)

            block_idx = len(block_mean_rewards)
            print(
                f"→ Completed Block {block_idx} "
                f"| Episodes {(block_idx - 1) * window_size + 1}–{block_idx * window_size} "
                f"| Block Total Reward: {block_sum:.3f} "
                f"| Block Mean Reward: {block_mean:.3f}"
            )

        for i in range(num_agents):
            agent_rewards_log[i].append(total_reward[i])
            agent_charge_log[i].append(np.mean(episode_charges[i]))
            agent_discharge_log[i].append(np.mean(episode_discharges[i]))

        # Baseline cost: grid imports without P2P priced at the grid price of each step.
        baseline_cost = np.sum([
            np.sum(entry["grid_import_no_p2p"]) * env.get_grid_price(entry["step"])
            for entry in day_logs
        ])
        actual_cost = np.sum([np.sum(entry["costs"]) for entry in day_logs])
        # The 1e-8 term guards against division by zero when the baseline is 0.
        cost_reduction = (baseline_cost - actual_cost) / (baseline_cost + 1e-8)

        # UPDATE STEP: Update each PG agent independently
        for agent in pg_agents:
            agent.update()

        # Save best models
        if mean_ep_reward > best_mean_reward:
            best_mean_reward = mean_ep_reward
            for i, agent in enumerate(pg_agents):
                agent_path = os.path.join(logs_dir, f"best_model_agent_{i}.pth")
                agent.save(agent_path)

        if episode % checkpoint_interval == 0:
            for i, agent in enumerate(pg_agents):
                ckpt_path = os.path.join(logs_dir, f"checkpoint_{episode}_agent_{i}.pth")
                agent.save(ckpt_path)

        episode_end_time = time.time()
        episode_duration = episode_end_time - episode_start_time

        print(
            f"Episode {episode}/{num_episodes} "
            f"| Time per Episode: {episode_duration:.2f}s "
            f"| Steps: {step_count} "
            f"| Mean Reward: {mean_ep_reward:.3f} "
            f"| Cost Reduction: {cost_reduction:.2%}"
        )

        episode_log_data.append({
            "Episode": episode,
            "Steps": step_count,
            "Mean_Reward": mean_ep_reward,
            "Total_Reward": sum_ep_reward,
            "Cost_Reduction_Pct": cost_reduction * 100,
            "Baseline_Cost": baseline_cost,
            "Actual_Cost": actual_cost,
            "Episode_Duration": episode_duration,
            "Total_Charge": np.sum([np.sum(entry["charge_amount"]) for entry in day_logs]),
            "Total_Discharge": np.sum([np.sum(entry["discharge_amount"]) for entry in day_logs])
        })

        # Periodic performance logging
        if episode % 100 == 0:
            # Slicing already copes with fewer than 100 entries.
            avg_reward_last_100 = np.mean(daily_rewards[-100:])
            print(f" → Average reward (last 100 episodes): {avg_reward_last_100:.3f}")

    # Final episode metrics
    final_episode_metrics = env.get_episode_metrics()
    final_episode_metrics['Episode'] = num_episodes
    performance_metrics_log.append(final_episode_metrics)

    training_end_time = time.time()
    total_training_time = training_end_time - training_start_time

    # Save final models
    print("\nSaving final models...")
    for i, agent in enumerate(pg_agents):
        final_path = os.path.join(logs_dir, f"final_model_agent_{i}.pth")
        agent.save(final_path)

    np.save(os.path.join(logs_dir, "agent_rewards.npy"), np.array(agent_rewards_log))
    np.save(os.path.join(logs_dir, "mean_rewards.npy"), np.array(episode_rewards))
    np.save(os.path.join(logs_dir, "total_rewards.npy"), np.array(episode_total_rewards))

    # Create DataFrames
    df_rewards_log = pd.DataFrame(episode_log_data)
    df_perf_log = pd.DataFrame(performance_metrics_log)
    df_final_log = pd.merge(
        df_rewards_log,
        df_perf_log.drop(columns=[
            'degradation_cost_over_time',
            'cost_savings_over_time',
            'grid_reduction_over_time'
        ]),
        on="Episode"
    )

    ma_window = 300
    # Take the x-axis from the merged frame: the merge is an inner join, so
    # its row count is what the plotted series actually have.
    episodes = df_final_log["Episode"].to_numpy()

    # Mean Reward moving average
    _save_moving_average_plot(
        episodes,
        _moving_avg(df_final_log["Mean_Reward"], ma_window),
        label=f"Mean Reward MA (win={ma_window})",
        ylabel="Mean Reward",
        title="PG: Mean Reward Moving Average",
        out_path=os.path.join(plots_dir, "mean_reward_ma.png"),
    )

    # Total Reward moving average
    _save_moving_average_plot(
        episodes,
        _moving_avg(df_final_log["Total_Reward"], ma_window),
        label=f"Total Reward MA (win={ma_window})",
        ylabel="Total Reward",
        title="PG: Total Reward Moving Average",
        out_path=os.path.join(plots_dir, "total_reward_ma.png"),
    )

    # Cost Reduction (%) moving average
    _save_moving_average_plot(
        episodes,
        _moving_avg(df_final_log["Cost_Reduction_Pct"], ma_window),
        label="Cost Reduction MA (%)",
        ylabel="Cost Reduction (%)",
        title="PG: Cost Reduction Moving Average",
        out_path=os.path.join(plots_dir, "cost_reduction_ma.png"),
    )

    # Battery Degradation Cost moving average
    _save_moving_average_plot(
        episodes,
        _moving_avg(df_final_log["battery_degradation_cost_total"], ma_window),
        label=f"Degradation Cost MA (win={ma_window})",
        ylabel="Total Degradation Cost ($)",
        title="PG: Battery Degradation Cost Moving Average",
        out_path=os.path.join(plots_dir, "degradation_cost_ma.png"),
        color='purple',
    )

    print(f"\nAll moving-average plots saved to: {plots_dir}")

    # Save Final Logs to CSV
    total_time_row = pd.DataFrame([{
        "Episode": "Total_Training_Time",
        "Episode_Duration": total_training_time
    }])
    df_to_save = pd.concat([df_final_log, total_time_row], ignore_index=True)

    log_csv_path = os.path.join(logs_dir, "training_performance_log.csv")
    columns_to_save = [
        "Episode",
        "Mean_Reward",
        "Total_Reward",
        "Cost_Reduction_Pct",
        "Episode_Duration",
        "battery_degradation_cost_total",
    ]
    df_to_save = df_to_save[columns_to_save]
    df_to_save.to_csv(log_csv_path, index=False)
    print(f"Saved comprehensive training performance log to: {log_csv_path}")

    # Final Timings Printout
    print("\n" + "="*50)
    print("TRAINING COMPLETE".center(50))
    print(f"Total training time: {total_training_time:.2f} seconds")
    print(f"Device used: {pg_agents[0].device}")
    print("="*50)


if __name__ == "__main__":
    main()