import os
import sys
import numpy as np
import torch
import matplotlib.pyplot as plt
import pandas as pd
import time
from datetime import datetime

# Make the parent directory importable so the environment and trainer imports below resolve
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from solar_sharer_battery_env import SolarSharer
from mappo.trainer.mappo import MAPPO
 
def main():

    STATE_TO_RUN = "pennsylvania" # "pennsylvania" or "colorado" or "oklahoma"

    # --- Set the path to your training data ---
    DATA_FILE_PATH = "/Users/ananygupta/Desktop/Final_revision/Australia_data/processed_data_ausgrid_100_houses.csv"
    num_episodes = 10000            # total number of episodes to run
    batch_size = 256                # PPO minibatch size (e.g. 512, 1024, 2048)
    checkpoint_interval = 100000    # note: never reached with num_episodes = 10000, so no periodic checkpoints are written
    window_size = 32                # group episodes into blocks of 32 for reward aggregation
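
    # For example, with window_size = 32 the training loop reports block 1
    # over episodes 1-32, block 2 over episodes 33-64, and so on.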


    env = SolarSharer(
        data_path=DATA_FILE_PATH,
        state=STATE_TO_RUN,
        time_freq="30T"   # 30-minute resolution (pandas offset alias)
    )
    ############################################################################################
    # ─── Sanity check: env I/O shapes ─────────────────────────────────────
    print("Observation space:", env.observation_space)
    print("Action space     :", env.action_space)

    # Reset and inspect obs
    obs = env.reset()
    print(f"Reset returned {len(obs)} agent observations; each obs shape: {np.array(obs).shape}")

    # Sample random actions in [0, 1) and take one step
    dummy_actions = np.random.rand(env.num_agents, env.action_space.shape[1]).astype(np.float32)
    next_obs, rewards, done, info = env.step(dummy_actions)
    print(f"Step outputs β†’ next_obs: {len(next_obs)}Γ—{np.array(next_obs).shape[1]}, "
          f"rewards: {len(rewards)}, done: {done}")
    print("Info keys:", list(info.keys()))
    # ────────────────────────────────────────────────────────────────

    # Count the number of houses in each group
    env.group_counts = {
        0: env.agent_groups.count(0),
        1: env.agent_groups.count(1)
    }
    print(f"Number of houses in each group: {env.group_counts}")

    max_steps = env.num_steps

    # dims from the env
    num_agents = env.num_agents
    local_state_dim = env.observation_space.shape[1]
    action_dim = env.action_space.shape[1]

    # ─── Build a unique run directory ───────────────────────────
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_name = f"mappo_{STATE_TO_RUN}_{num_agents}agents_{num_episodes}eps_{timestamp}"
    root_dir  = os.path.join("Testing_with_australia_data", run_name)
    os.makedirs(root_dir, exist_ok=True)
    print(f"Saving training outputs to: {root_dir}")

    logs_dir  = os.path.join(root_dir, "logs")
    plots_dir = os.path.join(root_dir, "plots")
    os.makedirs(logs_dir, exist_ok=True)
    os.makedirs(plots_dir, exist_ok=True)


    # Create the MAPPO agent
    mappo = MAPPO(
        n_agents=num_agents,
        local_dim=local_state_dim,
        global_dim=num_agents * local_state_dim,
        act_dim=action_dim,
        lr=2e-4,
        gamma=0.95,
        lam=0.95,
        clip_eps=0.2,
        k_epochs=4,
        batch_size=batch_size
    )
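
    # NOTE: trainer interface assumed by this script (inferred from the calls
    # below, not from the mappo package's own documentation):
    #   select_action(obs, global_obs) -> (actions, log_probs)
    #   store(obs, global_obs, actions, logps, rewards, done, next_global_obs)
    #   update()      # assumed: one PPO update over the stored transitions
    #   save(path)    # checkpoint the model weights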


    # ─────────────── Tracking / Logging Variables ───────────────
    episode_rewards = []             # mean reward per episode (averaged across agents)
    episode_total_rewards = []       # total reward per episode (sum across agents)
    block_mean_rewards = []          # mean of mean-episode-rewards for each block of window_size
    block_total_rewards = []         # sum of total-episode-rewards for each block of window_size

    agent_rewards_log = [[] for _ in range(num_agents)]
    best_mean_reward = -1e9
    best_model_path = os.path.join(logs_dir, "best_model.pth")


    daily_rewards = []               # alias for episode_rewards
    monthly_rewards = []             # unused; kept in case the old monthly-aggregation logic is restored

    training_start_time = time.time()
    episode_durations = []
    total_steps_global = 0
    episode_log_data = []
    # Detailed per-episode performance metrics reported by the environment
    performance_metrics_log = []


    agent_charge_log = [[] for _ in range(num_agents)]  # Track charge actions
    agent_discharge_log = [[] for _ in range(num_agents)]  # Track discharge actions


    # ──────────── Training Loop ────────────
    for episode in range(1, num_episodes + 1):
        episode_start_time = time.time()

        obs = np.array(env.reset(), dtype=np.float32)


        # Collect metrics from the *previous* episode.
        # =================================================================
        # The env.reset() call above finalized the metrics for the episode
        # that just finished, so we retrieve them here (there are no metrics
        # to collect before the first episode has run).
        if episode > 1:
            # Retrieve the finalized metrics from the environment
            last_episode_metrics = env.get_episode_metrics()
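            # (Per the post-processing at the end of this script, this dict is
            # expected to contain `battery_degradation_cost_total` plus the
            # per-step series `degradation_cost_over_time`,
            # `cost_savings_over_time` and `grid_reduction_over_time`.)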

            # Add the corresponding episode number for merging later
            last_episode_metrics['Episode'] = episode - 1

            # Append the dictionary of metrics to our new log
            performance_metrics_log.append(last_episode_metrics)
        # =================================================================

        total_reward = np.zeros(num_agents, dtype=np.float32)
        done = False
        step_count = 0
        day_logs = []
        episode_charges = [[] for _ in range(num_agents)]
        episode_discharges = [[] for _ in range(num_agents)]

        while not done:

            # obs is a NumPy array of shape (num_agents, local_dim);
            # flatten it once per step to build the global state, then pick actions
            global_obs      = obs.flatten()
            actions, logps  = mappo.select_action(obs, global_obs)

            # step environment
            next_obs_list, rewards, done, info = env.step(actions)

            # convert next observations to NumPy array too
            next_obs        = np.array(next_obs_list, dtype=np.float32)
            next_global_obs = next_obs.flatten()


            # store transition
            # ensure fast conversion to torch.Tensor
            local_obs_arr      = np.array(obs,      dtype=np.float32)

            mappo.store(
                local_obs_arr,
                global_obs,
                actions,
                logps,
                rewards,
                done,
                next_global_obs
            )
            total_reward += rewards
            obs = next_obs
            step_count += 1
            total_steps_global += 1

            day_logs.append({
                "step": step_count - 1,
                "grid_import_no_p2p": info["grid_import_no_p2p"],
                "grid_import_with_p2p": info["grid_import_with_p2p"],
                "p2p_buy": info["p2p_buy"],
                "p2p_sell": info["p2p_sell"],
                "costs": info["costs"],  # Capture costs for analysis
                "charge_amount": info.get("charge_amount", np.zeros(num_agents)),  # New
                "discharge_amount": info.get("discharge_amount", np.zeros(num_agents))  # New
            })

            if step_count >= max_steps:
                break

        # ─── After each episode ───
        # 1) Compute per-episode metrics
        sum_ep_reward = float(np.sum(total_reward))        # total reward across all agents for this episode
        mean_ep_reward = float(np.mean(total_reward))      # mean reward across agents for this episode

        episode_total_rewards.append(sum_ep_reward)
        episode_rewards.append(mean_ep_reward)
        daily_rewards.append(mean_ep_reward)

        # 2) If we just finished a block of window_size episodes, aggregate
        if len(daily_rewards) % window_size == 0:
            # Sum of total rewards over the last window_size episodes
            last_totals = episode_total_rewards[-window_size:]
            block_sum = sum(last_totals)
            block_total_rewards.append(block_sum)

            # Mean of mean-episode-rewards over the last window_size episodes
            last_means = daily_rewards[-window_size:]
            block_mean = sum(last_means) / window_size
            block_mean_rewards.append(block_mean)

            block_idx = len(block_mean_rewards)
            print(
                f"β†’ Completed Block {block_idx} "
                f"| Episodes { (block_idx-1)*window_size + 1 }–{ block_idx*window_size } "
                f"| Block Total Reward: {block_sum:.3f} "
                f"| Block Mean Reward: {block_mean:.3f}"
            )

        # 3) Log agent-level rewards and the final step's battery actions
        #    (indices 4 and 5 of the action vector are taken to be the
        #    charge and discharge components)
        for i in range(num_agents):
            agent_rewards_log[i].append(total_reward[i])
            episode_charges[i].append(actions[i][4])
            episode_discharges[i].append(actions[i][5])

        # 4) Summarize P2P activity per step (steps_data is currently unused downstream)
        steps_data = []
        for entry in day_logs:
            step_idx = entry["step"]
            p2p_buy_array  = entry["p2p_buy"]
            p2p_sell_array = entry["p2p_sell"]
            grid_no_p2p_array   = entry["grid_import_no_p2p"]
            grid_with_p2p_array = entry["grid_import_with_p2p"]

            steps_data.append({
                "step": step_idx,
                "p2p_buy_sum":  float(np.sum(p2p_buy_array)),
                "p2p_sell_sum": float(np.sum(p2p_sell_array)),
                "grid_import_no_p2p_sum":   float(np.sum(grid_no_p2p_array)),
                "grid_import_with_p2p_sum": float(np.sum(grid_with_p2p_array))
            })


        baseline_cost = np.sum([np.sum(entry["grid_import_no_p2p"]) * env.get_grid_price(entry["step"])
                                for entry in day_logs])
        actual_cost = np.sum([np.sum(entry["costs"]) for entry in day_logs])
        # Guard against a zero baseline (no grid imports) to avoid division by zero
        cost_reduction = (baseline_cost - actual_cost) / baseline_cost if baseline_cost > 0 else 0.0

        # at end of episode
        mappo.update()  # Update the MAPPO agent


        # save if best
        if mean_ep_reward > best_mean_reward:
            best_mean_reward = mean_ep_reward
            mappo.save(best_model_path)

        if episode % checkpoint_interval == 0:
            ckpt_path = os.path.join(logs_dir, f"checkpoint_{episode}.pth")
            mappo.save(ckpt_path)
        # Timing and per-episode logging
        episode_end_time = time.time()
        episode_duration = episode_end_time - episode_start_time

        # Per-episode progress printout
        print(
            f"Episode {episode}/{num_episodes} "
            f"| Time per Episode: {episode_duration:.2f}s "
            f"| Steps: {step_count} "
            f"| Mean Reward: {mean_ep_reward:.3f} "
            f"| Cost Reduction: {cost_reduction:.2%}"
        )

        # Record data in our per-episode log
        episode_log_data.append({
            "Episode": episode,
            "Steps": step_count,
            "Mean_Reward": mean_ep_reward,
            "Total_Reward": sum_ep_reward,
            "Cost_Reduction_Pct": cost_reduction * 100,  # New
            "Baseline_Cost": baseline_cost,  # New
            "Actual_Cost": actual_cost,  # New
            "Episode_Duration": episode_duration,
            "Total_Charge": np.sum([np.sum(entry["charge_amount"]) for entry in day_logs]),  # New
            "Total_Discharge": np.sum([np.sum(entry["discharge_amount"]) for entry in day_logs])  # New
        })
        # 5) Log each agent's mean charge/discharge action for this episode
        for i in range(num_agents):
            agent_charge_log[i].append(np.mean(episode_charges[i]))
            agent_discharge_log[i].append(np.mean(episode_discharges[i]))

    # Capture the final episode's metrics.
    # =================================================================
    # After the loop, the metrics for the final episode (num_episodes) are
    # ready; collect them here so the log is complete.
    final_episode_metrics = env.get_episode_metrics()
    final_episode_metrics['Episode'] = num_episodes
    performance_metrics_log.append(final_episode_metrics)
    # =================================================================



    # ─── End of all training ───
    training_end_time = time.time()
    total_training_time = training_end_time - training_start_time

    # Save out per-episode agent rewards + mean rewards
    np.save(os.path.join(logs_dir, "agent_rewards.npy"), np.array(agent_rewards_log))
    np.save(os.path.join(logs_dir, "mean_rewards.npy"),  np.array(episode_rewards))
    np.save(os.path.join(logs_dir, "total_rewards.npy"), np.array(episode_total_rewards))

    ################################# PLOTTING & LOGGING ##################################################################
    # ─────────── Create Final DataFrame for Logging and Plotting ───────────

    # 1. Create a DataFrame from the original log data (rewards, costs, etc.)
    df_rewards_log = pd.DataFrame(episode_log_data)

    # 2. Create a DataFrame from the new performance metrics log
    df_perf_log = pd.DataFrame(performance_metrics_log)

    # 3. Merge the two DataFrames on the 'Episode' column.
    #    This combines all metrics into a single table.
    df_final_log = pd.merge(df_rewards_log, df_perf_log.drop(columns=[
        'degradation_cost_over_time',
        'cost_savings_over_time',
        'grid_reduction_over_time'
    ]), on="Episode")
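
    # (The dropped columns are assumed to be per-step time series, which do
    # not fit a flat per-episode table; only scalar metrics are merged.)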


    # ─────────── PLOTTING ───────────

    # Ensure plot directory exists
    os.makedirs(plots_dir, exist_ok=True)

    # Helper: centered moving average
    def moving_avg(series, window):
        return pd.Series(series).rolling(window=window, center=True, min_periods=1).mean().to_numpy()
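
    # Edge behaviour of the centered window (min_periods=1 lets the window
    # shrink at the boundaries instead of emitting NaN), e.g.:
    #   moving_avg([1, 2, 3, 4, 5], 3) -> [1.5, 2.0, 3.0, 4.0, 4.5]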

    # Smoothing window (in episodes)
    ma_window = 300
    episodes = np.arange(1, num_episodes + 1)

    # 1. Mean Reward moving average
    reward_ma = moving_avg(df_final_log["Mean_Reward"], ma_window)
    plt.figure(figsize=(8,5))
    plt.plot(episodes, reward_ma, linewidth=2, label=f"Mean Reward MA (win={ma_window})")
    plt.xlabel("Episode")
    plt.ylabel("Mean Reward")
    plt.title("MAPPO: Mean Reward Moving Average")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(plots_dir, "mean_reward_ma.png"), dpi=200)
    plt.close()

    # 2. Total Reward moving average
    total_ma = moving_avg(df_final_log["Total_Reward"], ma_window)
    plt.figure(figsize=(8,5))
    plt.plot(episodes, total_ma, linewidth=2, label=f"Total Reward MA (win={ma_window})")
    plt.xlabel("Episode")
    plt.ylabel("Total Reward")
    plt.title("MAPPO: Total Reward Moving Average")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(plots_dir, "total_reward_ma.png"), dpi=200)
    plt.close()

    # 3. Cost Reduction (%) moving average
    cost_ma = moving_avg(df_final_log["Cost_Reduction_Pct"], ma_window)
    plt.figure(figsize=(8,5))
    plt.plot(episodes, cost_ma, linewidth=2, label="Cost Reduction MA (%)")
    plt.xlabel("Episode")
    plt.ylabel("Cost Reduction (%)")
    plt.title("MAPPO: Cost Reduction Moving Average")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(plots_dir, "cost_reduction_ma.png"), dpi=200)
    plt.close()

    # 4. Battery Degradation Cost moving average
    degradation_ma = moving_avg(df_final_log["battery_degradation_cost_total"], ma_window)
    plt.figure(figsize=(8,5))
    plt.plot(episodes, degradation_ma, linewidth=2, label=f"Degradation Cost MA (win={ma_window})", color='purple')
    plt.xlabel("Episode")
    plt.ylabel("Total Degradation Cost ($)")
    plt.title("MAPPO: Battery Degradation Cost Moving Average")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(plots_dir, "degradation_cost_ma.png"), dpi=200)
    plt.close()


    # Final confirmation message
    print(f"\nAll moving-average plots saved to: {plots_dir}")


    # ─── Save Final Logs to CSV ───

    # 1. Add the total training time as a new row to the DataFrame
    total_time_row = pd.DataFrame([{
        "Episode": "Total_Training_Time",
        "Episode_Duration": total_training_time
    }])
    df_to_save = pd.concat([df_final_log, total_time_row], ignore_index=True)


    # 2. Define the path for the final CSV file.
    log_csv_path = os.path.join(logs_dir, "training_performance_log.csv")

    # 3. Select and reorder columns for the final CSV
    columns_to_save = [
        "Episode",
        "Mean_Reward",
        "Total_Reward",
        "Cost_Reduction_Pct",
        "Episode_Duration",
        "battery_degradation_cost_total",
    ]
    df_to_save = df_to_save[columns_to_save]


    # 4. Save the comprehensive DataFrame to CSV.
    df_to_save.to_csv(log_csv_path, index=False)

    print(f"Saved comprehensive training performance log to: {log_csv_path}")

    # ─── Final Timings Printout ───
    print("\n" + "="*50)
    print("TRAINING COMPLETE".center(50))
    print(f"Total training time: {total_training_time:.2f} seconds")
    print("="*50)


if __name__ == "__main__":
    main()