#!/usr/bin/env python3
"""
GridMind-RL Training Curve Plotter
----------------------------------
Reads the training CSV generated by train_unsloth.py and saves a PNG plot
of the reward components (episode score and JSON-format reward) per
training step to show learning progress.
Also overlays the heuristic baseline as a reference line.
"""

import argparse
import os
import json
import pandas as pd
import matplotlib.pyplot as plt

def load_heuristic_scores():
    """Load heuristic baseline scores, if available.

    Reads ``results/baseline_scores_heuristic.json`` relative to the current
    working directory.

    Returns:
        The decoded JSON content, or ``None`` when the file does not exist or
        its content cannot be decoded. The baseline is optional, so an
        unreadable file is reported with a warning instead of raising.
    """
    path = "results/baseline_scores_heuristic.json"
    try:
        # Open directly instead of checking existence first: avoids a
        # check-then-use race and a second filesystem lookup.
        with open(path, encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return None
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        print(f"Warning: could not parse {path}: {exc}")
        return None

# Fallback baseline used when the heuristic JSON is missing or unusable.
_DEFAULT_HEURISTIC_AVG = 0.514  # overall heuristic average from real runs
_SMOOTHING_WINDOW = 5  # rolling-mean window, in logged rows
_TEXT_COLOR = "#e6edf3"
_GRID_COLOR = "#8b949e"


def _resolve_baseline(heuristic_data):
    """Return the heuristic baseline as a float, or the built-in default."""
    if not isinstance(heuristic_data, dict):
        return _DEFAULT_HEURISTIC_AVG
    try:
        return float(heuristic_data.get("overall_average", _DEFAULT_HEURISTIC_AVG))
    except (TypeError, ValueError):
        # Present but not numeric (e.g. null or a non-numeric string).
        return _DEFAULT_HEURISTIC_AVG


def _plot_metric(ax, steps, raw, *, color, smooth_label, ylabel, title, ylim,
                 baseline=None):
    """Draw one raw + rolling-mean curve on *ax* and return the smoothed series.

    When *baseline* is given, it is drawn as a dashed horizontal reference
    line after the two curves so the legend order stays raw, smoothed,
    baseline.
    """
    smooth = raw.rolling(window=_SMOOTHING_WINDOW, min_periods=1).mean()
    ax.plot(steps, raw, alpha=0.25, color=color, label="Raw")
    ax.plot(steps, smooth, color=color, linewidth=2.5, label=smooth_label)
    if baseline is not None:
        ax.axhline(y=baseline, color="#FF6B6B", linestyle="--", linewidth=2,
                   label=f"Heuristic baseline ({baseline:.3f})")
    ax.set_xlabel("Training Step", fontsize=11, color=_TEXT_COLOR)
    ax.set_ylabel(ylabel, fontsize=11, color=_TEXT_COLOR)
    ax.set_title(title, fontsize=12, color=_TEXT_COLOR)
    ax.legend(fontsize=10)
    ax.grid(True, linestyle="--", alpha=0.3, color=_GRID_COLOR)
    ax.set_ylim(*ylim)
    return smooth


def main(argv=None):
    """Plot the training curves from the training CSV and save them as a PNG.

    The left panel shows ``rewards/reward_env_interaction/mean`` with the
    heuristic baseline overlaid; the right panel shows
    ``rewards/reward_json_valid/mean``. A panel whose column is absent from
    the CSV is left empty and a notice is printed.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description="Plot training learning curves")
    parser.add_argument("--csv", type=str, default="results/training_log.csv", help="Path to training CSV")
    parser.add_argument("--output", type=str, default="results/training_curve.png", help="Path to save PNG")
    args = parser.parse_args(argv)

    heuristic_data = load_heuristic_scores()

    if not os.path.exists(args.csv):
        print("No CSV found.")
        return

    print(f"Reading training logs from {args.csv}")
    try:
        df = pd.read_csv(args.csv)
    except pd.errors.EmptyDataError:
        # A zero-byte file has no header row for pandas to parse.
        print("CSV is empty.")
        return
    if "step" not in df.columns:
        print("No 'step' column found.")
        return

    # Get baseline scores from our real runs
    h_avg = _resolve_baseline(heuristic_data)

    plt.style.use("dark_background")
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    try:
        # Left: Episode score (from /grade)
        episode_col = "rewards/reward_env_interaction/mean"
        if episode_col in df.columns:
            raw = df[episode_col]
            smooth = _plot_metric(
                axes[0], df["step"], raw,
                color="#4ECDC4",
                smooth_label="Trained LLM (smoothed)",
                ylabel="Episode Score (0.0-1.0)",
                title="Episode Score from /grade Endpoint\n(Higher = Better Energy Management)",
                ylim=(0.35, 0.75),
                baseline=h_avg,
            )
            # Summarize over logged values only: a header-only CSV or an
            # all-NaN column has no first/last value to report.
            valid_raw = raw.dropna()
            valid_smooth = smooth.dropna()
            if not valid_raw.empty and not valid_smooth.empty:
                print(f"Episode score: {valid_raw.iloc[0]:.3f} -> {valid_smooth.iloc[-1]:.3f}")
        else:
            print(f"Column '{episode_col}' not found; leaving panel empty.")

        # Right: JSON validity
        json_col = "rewards/reward_json_valid/mean"
        if json_col in df.columns:
            _plot_metric(
                axes[1], df["step"], df[json_col],
                color="#FFE66D",
                smooth_label="JSON Validity (smoothed)",
                ylabel="JSON Format Reward (0.0-0.2)",
                title="Action Format Compliance\n(Higher = Better JSON Output)",
                ylim=(0, 0.22),
            )
        else:
            print(f"Column '{json_col}' not found; leaving panel empty.")

        fig.tight_layout()
        # Create the output directory only once there is something to save.
        os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
        fig.savefig(args.output, dpi=150, bbox_inches="tight", facecolor="#0d1117")
    finally:
        # Release the figure even if plotting or saving fails.
        plt.close(fig)
    print(f"Training curve saved to {args.output}")

# Run the plotter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()