"""
plot_results.py — Results Charts for Hackathon Presentation
============================================================
Generates publication-ready charts from evaluation JSON data.
Saves all charts to results/charts/ as both PNG and SVG.

BASELINE CONSTANTS (FINAL — measured, do not change):
  R1 Llama-3.1-8B zero-shot: easy=0.0100, medium=0.4583, hard=0.0100, avg=0.1594
  R2 Llama-3.1-8B zero-shot: easy=0.3198, medium=0.2443, hard=0.2520, avg=0.2720
  Training model:             Qwen/Qwen2.5-1.5B-Instruct (GRPO, 4-bit QLoRA)

Charts produced:
  1. r1_scores_comparison.png  — R1 Llama baseline vs rule-based vs trained bar chart
  2. r2_scores_comparison.png  — R2 Llama zero-shot vs trained bar chart (no rule-based)
  3. sprint_rewards.png        — Sprint-by-sprint reward for each R2 scenario
  4. improvement_summary.png   — Combined before/after delta chart (main slide chart)
  5. training_curve.png        — GRPO training loss/reward curve (if trainer_state.json present)

Usage:
    # After running evaluate_r2.py --baseline-only:
    python plot_results.py --eval results/r2_evaluation.json

    # With training curve (after train_llm.py):
    python plot_results.py --eval results/r2_evaluation.json \\
                           --trainer results/trained_model/trainer_state.json

    # Use the hardcoded baselines with placeholder trained scores (no eval file needed):
    python plot_results.py --demo
"""

from __future__ import annotations

import argparse
import json
import os
from pathlib import Path

# Every chart is written below this directory; it is created at import time.
CHARTS_DIR = Path("results") / "charts"
CHARTS_DIR.mkdir(parents=True, exist_ok=True)

# ── Measured baselines (FINAL) ────────────────────────────────────────────────
# Llama-3.1-8B zero-shot scores keyed by task name, plus the recorded "average".
LLAMA_BASELINE_R1 = dict(
    easy_sprint=0.0100,
    medium_sprint=0.4583,
    hard_sprint=0.0100,
    average=0.1594,
)
LLAMA_BASELINE_R2 = dict(
    project_easy=0.3198,
    project_medium=0.2443,
    project_hard=0.2520,
    average=0.2720,
)
# Name of the model whose post-training scores are charted against the baselines.
TRAINING_MODEL = "Qwen/Qwen2.5-1.5B-Instruct"

# ── Colour palette ────────────────────────────────────────────────────────────
# Series colours.
C_LLAMA = "#6B7280"    # grey  — Llama zero-shot baseline
C_RULE = "#3B82F6"     # blue  — rule-based baseline
C_TRAINED = "#10B981"  # green — trained Qwen (post-GRPO)
# Per-difficulty line colours for the sprint reward panels.
C_EASY = "#60A5FA"
C_MEDIUM = "#F59E0B"
C_HARD = "#EF4444"
# Figure/axes background and text colours.
C_BG = "#F9FAFB"
C_TEXT = "#111827"


def _setup_matplotlib():
    """Select the Agg backend, apply the shared chart theme, and hand back pyplot.

    Returns:
        A ``(matplotlib.pyplot, matplotlib.patches)`` tuple.
    """
    import matplotlib

    # Agg renders straight to files, so no display is required.
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt
    import matplotlib.patches as mpatches

    # Background and frame colours.
    theme = {
        "figure.facecolor": C_BG,
        "axes.facecolor": C_BG,
        "axes.edgecolor": "#D1D5DB",
    }
    # Every text-like element shares one foreground colour.
    for text_key in ("axes.labelcolor", "text.color", "xtick.color", "ytick.color"):
        theme[text_key] = C_TEXT
    # Grid appearance.
    theme["grid.color"] = "#E5E7EB"
    theme["grid.linestyle"] = "--"
    theme["grid.alpha"] = 0.7
    # Fonts and resolution.
    theme["font.family"] = "sans-serif"
    theme["font.size"] = 11
    theme["axes.titlesize"] = 13
    theme["axes.titleweight"] = "bold"
    theme["figure.dpi"] = 150

    plt.rcParams.update(theme)
    return plt, mpatches


def save(plt, name: str):
    """Write the current figure to CHARTS_DIR as PNG and SVG, then close it.

    Args:
        plt: The ``matplotlib.pyplot`` module holding the current figure.
        name: File stem (without extension) shared by both output files.
    """
    png_path, svg_path = (CHARTS_DIR / f"{name}.{ext}" for ext in ("png", "svg"))

    plt.tight_layout()
    for target in (png_path, svg_path):
        plt.savefig(target, bbox_inches="tight")
    plt.close()

    # Only the PNG path is reported; the SVG is written next to it.
    print(f"  [SAVED] {png_path}", flush=True)


# ── Chart 1: R1 scores comparison ────────────────────────────────────────────

def chart_r1_comparison(eval_data: dict):
    """Draw the Round 1 grouped bar chart and save it as ``r1_scores_comparison``.

    Args:
        eval_data: Evaluation results. Reads ``r1_llama_baseline`` (task -> score,
            falling back to ``LLAMA_BASELINE_R1``) and ``r1_rule_based`` /
            ``r1_llm`` (task -> ``{"avg_score": ...}``). Missing entries are
            plotted as 0. The trained-model series is drawn only when at least
            one of its scores is positive.
    """
    plt, _ = _setup_matplotlib()

    tasks = ["easy_sprint", "medium_sprint", "hard_sprint"]
    labels = ["Easy Sprint", "Medium Sprint", "Hard Sprint"]
    llama_base = [eval_data.get("r1_llama_baseline", LLAMA_BASELINE_R1).get(t, 0) for t in tasks]
    rule_based = [eval_data.get("r1_rule_based", {}).get(t, {}).get("avg_score", 0) for t in tasks]
    llm_scores = [eval_data.get("r1_llm", {}).get(t, {}).get("avg_score", 0) for t in tasks]

    has_llm = any(v > 0 for v in llm_scores)
    width = 0.28 if has_llm else 0.38

    # (values, legend label, colour) for every series that will be drawn.
    series = [
        (llama_base, "Llama-3.1-8B (zero-shot baseline)", C_LLAMA),
        (rule_based, "Rule-based", C_RULE),
    ]
    if has_llm:
        series.append((llm_scores, f"{TRAINING_MODEL} (GRPO trained)", C_TRAINED))

    _, ax = plt.subplots(figsize=(9, 5))
    positions = range(len(tasks))

    # Centre each group on its tick regardless of how many series are drawn:
    # three series sit at -w, 0, +w; two series sit at -w/2, +w/2.
    first_slot = -(len(series) - 1) / 2
    for slot, (values, legend_label, color) in enumerate(series):
        shift = (first_slot + slot) * width
        bars = ax.bar([p + shift for p in positions], values, width,
                      label=legend_label, color=color, zorder=3)
        for bar in bars:
            h = bar.get_height()
            # Bars at or below 0.01 are left unlabelled.
            if h > 0.01:
                ax.text(bar.get_x() + bar.get_width() / 2, h + 0.01,
                        f"{h:.2f}", ha="center", va="bottom", fontsize=9)

    ax.set_xticks(list(positions))
    ax.set_xticklabels(labels)
    ax.set_ylim(0, 1.15)
    ax.set_ylabel("Score (0.01 – 0.99)")
    ax.set_title("Round 1 — Score Comparison")
    ax.legend(loc="upper right")
    ax.yaxis.grid(True, zorder=0)
    ax.set_axisbelow(True)

    save(plt, "r1_scores_comparison")


# ── Chart 2: R2 scores comparison ────────────────────────────────────────────

def chart_r2_comparison(eval_data: dict):
    """Draw the Round 2 project-score bar chart and save it as ``r2_scores_comparison``.

    Args:
        eval_data: Evaluation results. Reads ``r2_llama_baseline`` (task -> score,
            falling back to ``LLAMA_BASELINE_R2``) and ``r2_llm``
            (task -> ``{"avg_score": ...}``). Missing entries are plotted as 0.
            The trained-model series is drawn only when at least one of its
            scores is positive.
    """
    plt, _ = _setup_matplotlib()

    tasks = ["project_easy", "project_medium", "project_hard"]
    labels = ["Easy (6 sprints)", "Medium (6 sprints)", "Hard (6 sprints)"]
    llama_base = [eval_data.get("r2_llama_baseline", LLAMA_BASELINE_R2).get(t, 0) for t in tasks]
    llm_scores = [eval_data.get("r2_llm", {}).get(t, {}).get("avg_score", 0) for t in tasks]

    has_llm = any(v > 0 for v in llm_scores)
    width = 0.32 if has_llm else 0.5

    # (values, legend label, colour) for every series that will be drawn.
    series = [(llama_base, "Llama-3.1-8B (zero-shot)", C_LLAMA)]
    if has_llm:
        series.append((llm_scores, f"{TRAINING_MODEL} (GRPO)", C_TRAINED))

    _, ax = plt.subplots(figsize=(9, 5))
    positions = range(len(tasks))

    # Centre each group on its tick: a lone baseline bar sits exactly on the
    # tick, a baseline/trained pair sits at -w/2 and +w/2.
    first_slot = -(len(series) - 1) / 2
    for slot, (values, legend_label, color) in enumerate(series):
        shift = (first_slot + slot) * width
        bars = ax.bar([p + shift for p in positions], values, width,
                      label=legend_label, color=color, zorder=3)
        for bar in bars:
            h = bar.get_height()
            # Bars at or below 0.01 are left unlabelled.
            if h > 0.01:
                ax.text(bar.get_x() + bar.get_width() / 2, h + 0.01,
                        f"{h:.2f}", ha="center", va="bottom", fontsize=9)

    ax.set_xticks(list(positions))
    ax.set_xticklabels(labels)
    ax.set_ylim(0, 1.15)
    ax.set_ylabel("Project Score (delivery × instruction × health)")
    ax.set_title("Round 2 — Multi-Sprint Project Score")
    ax.legend(loc="upper right")
    ax.yaxis.grid(True, zorder=0)
    ax.set_axisbelow(True)

    save(plt, "r2_scores_comparison")


# ── Chart 3: Sprint reward curves ─────────────────────────────────────────────

def chart_sprint_rewards(eval_data: dict):
    """Plot the trained model's per-sprint reward for each R2 scenario.

    One panel is drawn per scenario from the ``sprint_rewards`` list of the
    first episode under ``eval_data["r2_llm"][task]["episodes"]``. A panel with
    no such data shows placeholder text instead of a line. The figure is saved
    as ``sprint_rewards``.

    Args:
        eval_data: Evaluation results; only the ``r2_llm`` entry is read.
    """
    plt, _ = _setup_matplotlib()
    tasks = ["project_easy", "project_medium", "project_hard"]
    colors = [C_EASY, C_MEDIUM, C_HARD]
    labels = ["Easy", "Medium", "Hard"]

    fig, axes = plt.subplots(1, 3, figsize=(13, 4), sharey=True)

    for panel, (ax, task, color, label) in enumerate(zip(axes, tasks, colors, labels)):
        episodes = eval_data.get("r2_llm", {}).get(task, {}).get("episodes", [])
        rewards = episodes[0].get("sprint_rewards", []) if episodes else []

        if rewards:
            ax.plot(range(1, len(rewards) + 1), rewards, "o-",
                    color=color, label="Qwen GRPO", linewidth=2, markersize=6)
            # A legend is only requested when a labelled line exists; calling
            # it on an empty panel makes matplotlib warn about missing handles.
            ax.legend(fontsize=8)
        else:
            ax.text(0.5, 0.5, "No episode data", transform=ax.transAxes,
                    ha="center", va="center", fontsize=9, color=C_LLAMA)

        ax.set_xlabel("Sprint")
        ax.set_title(f"{label} Project")
        ax.set_xticks(range(1, 7))
        ax.set_ylim(0, 2.2)
        ax.yaxis.grid(True)
        ax.set_axisbelow(True)
        # The y axis is shared, so only the left-most panel carries its label.
        if panel == 0:
            ax.set_ylabel("Sprint Reward")

    fig.suptitle("Sprint-by-Sprint Reward — Qwen GRPO Trained", y=1.02, fontsize=13, fontweight="bold")
    save(plt, "sprint_rewards")


# ── Chart 4: Improvement summary (main presentation slide) ───────────────────

def chart_improvement_summary(eval_data: dict):
    """Draw the combined before/after chart and save it as ``improvement_summary``.

    The "before" bars are the Llama zero-shot baselines and the "after" bars
    are the trained model's ``avg_score`` values, for the three R1 tasks
    followed by the three R2 tasks. An arrow and a ``+delta`` label are added
    wherever the trained score beats the baseline by more than 0.01.

    Args:
        eval_data: Evaluation results. Reads ``r1_llama_baseline`` /
            ``r2_llama_baseline`` (falling back to the module constants) and
            ``r1_llm`` / ``r2_llm``; missing scores count as 0.
    """
    plt, _ = _setup_matplotlib()

    r1_names = ("easy_sprint", "medium_sprint", "hard_sprint")
    r2_names = ("project_easy", "project_medium", "project_hard")

    tick_labels = [f"R1: {name.replace('_sprint','').title()}" for name in r1_names]
    tick_labels += [f"R2: {name.replace('project_','').title()}" for name in r2_names]

    r1_before = eval_data.get("r1_llama_baseline", LLAMA_BASELINE_R1)
    r2_before = eval_data.get("r2_llama_baseline", LLAMA_BASELINE_R2)
    r1_after = eval_data.get("r1_llm", {})
    r2_after = eval_data.get("r2_llm", {})

    before = [r1_before.get(name, 0) for name in r1_names]
    before += [r2_before.get(name, 0) for name in r2_names]
    after = [r1_after.get(name, {}).get("avg_score", 0) for name in r1_names]
    after += [r2_after.get(name, {}).get("avg_score", 0) for name in r2_names]

    bar_width = 0.36
    half = bar_width / 2
    slots = list(range(len(tick_labels)))
    _, ax = plt.subplots(figsize=(11, 5))

    ax.bar([s - half for s in slots], before, bar_width,
           label="Before: Llama-3.1-8B (zero-shot)", color=C_LLAMA, zorder=3)
    ax.bar([s + half for s in slots], after, bar_width,
           label=f"After: {TRAINING_MODEL} (GRPO)", color=C_TRAINED, zorder=3)

    # Arrow from the top of the "before" bar to the top of the "after" bar,
    # with the gain printed above the taller of the two.
    for slot, (old, new) in enumerate(zip(before, after)):
        if not new > old + 0.01:
            continue
        ax.annotate(
            "",
            xy=(slot + half, new + 0.02),
            xytext=(slot - half, old + 0.02),
            arrowprops=dict(arrowstyle="->", color="#059669", lw=1.5),
        )
        ax.text(slot, max(old, new) + 0.06, f"+{new-old:.2f}",
                ha="center", fontsize=8, color="#059669", fontweight="bold")

    ax.set_xticks(slots)
    ax.set_xticklabels(tick_labels, rotation=15, ha="right")
    ax.set_ylim(0, 1.25)
    ax.set_ylabel("Score")
    ax.set_title("Reward Improvement: Llama Zero-Shot → Qwen2.5-1.5B GRPO Trained")
    ax.legend(loc="upper left")
    ax.yaxis.grid(True, zorder=0)
    ax.set_axisbelow(True)

    # Dotted divider between the R1 and R2 groups, with a caption over each.
    ax.axvline(x=2.5, color="#9CA3AF", linestyle=":", linewidth=1.5)
    for caption_x, caption in ((1.0, "Round 1"), (4.0, "Round 2")):
        ax.text(caption_x, 1.20, caption, ha="center", fontsize=10, color="#6B7280")

    save(plt, "improvement_summary")


# ── Chart 5: Training loss/reward curve ───────────────────────────────────────

def chart_training_curve(trainer_state_path: str):
    """Plot training loss and mean reward from a trainer state file.

    Reads the ``log_history`` list from the JSON file and collects
    ``(step, loss)`` and ``(step, reward)`` points from the entries that carry
    those keys (``step`` defaults to 0 when absent). The chart is saved as
    ``training_curve``. Nothing is saved when the log holds neither series;
    when only one series is present the other panel shows placeholder text.

    Args:
        trainer_state_path: Path to the ``trainer_state.json`` file.
    """
    plt, _ = _setup_matplotlib()

    with open(trainer_state_path, encoding="utf-8") as f:
        state = json.load(f)

    log_history = state.get("log_history", [])
    if not log_history:
        print("  [SKIP] No log_history in trainer_state.json", flush=True)
        return

    loss_points = [(e.get("step", 0), e["loss"]) for e in log_history if "loss" in e]
    reward_points = [(e.get("step", 0), e["reward"]) for e in log_history if "reward" in e]

    # Without either series the figure would be two empty axes; skip instead.
    if not loss_points and not reward_points:
        print("  [SKIP] No loss or reward entries in trainer_state.json", flush=True)
        return

    fig, axes = plt.subplots(1, 2, figsize=(11, 4))

    panels = (
        (axes[0], loss_points, C_RULE, "Loss", "GRPO Training Loss (Qwen2.5-1.5B)"),
        (axes[1], reward_points, C_TRAINED, "Mean Reward", "GRPO Mean Reward per Step"),
    )
    for ax, points, color, y_label, title in panels:
        # Label every panel so a missing series does not leave anonymous axes.
        ax.set_xlabel("Training Step")
        ax.set_ylabel(y_label)
        ax.set_title(title)
        if points:
            xs, ys = zip(*points)
            ax.plot(xs, ys, color=color, linewidth=2)
            ax.yaxis.grid(True)
        else:
            ax.text(0.5, 0.5, "No data logged", transform=ax.transAxes,
                    ha="center", va="center", fontsize=9, color=C_LLAMA)

    fig.suptitle(f"GRPO Training Curves — {TRAINING_MODEL}", fontsize=13, fontweight="bold")
    save(plt, "training_curve")


# ── Demo mode (hardcoded real baselines, placeholder trained scores) ──────────

def demo_mode():
    """
    Build the evaluation dict used when no evaluation file is supplied.

    Baselines come from the measured Llama constants. Every trained score is a
    0.0 placeholder — replace with real evaluate_r2.py output after on-site training.

    Returns:
        A dict with the keys ``r1_llama_baseline``, ``r2_llama_baseline``,
        ``r1_rule_based``, ``r1_llm``, ``r2_rule_based``, ``r2_llm`` and
        ``improvement``.
    """
    print("[INFO] Demo mode — real Llama baselines, placeholder trained scores", flush=True)
    print(f"[INFO] Training model: {TRAINING_MODEL}", flush=True)

    # Placeholder trained scores — update after on-site GRPO training
    r1_trained = {
        "easy_sprint":   0.0,   # update after training
        "medium_sprint": 0.0,   # update after training
        "hard_sprint":   0.0,   # update after training
    }
    r2_trained = {
        "project_easy":   0.0,  # update after training
        "project_medium": 0.0,  # update after training
        "project_hard":   0.0,  # update after training
    }

    r1_rule_based = {
        "easy_sprint":   {"avg_score": 0.92},
        "medium_sprint": {"avg_score": 0.35},
        "hard_sprint":   {"avg_score": 0.01},
    }

    r1_llm = {}
    for task, score in r1_trained.items():
        r1_llm[task] = {"avg_score": score}

    r2_llm = {}
    for task, score in r2_trained.items():
        r2_llm[task] = {"avg_score": score, "episodes": []}

    # Per-scenario R2 delta of the trained score against the Llama baseline.
    improvement = {}
    for task in ("project_easy", "project_medium", "project_hard"):
        baseline = LLAMA_BASELINE_R2[task]
        trained = r2_trained[task]
        improvement[task] = {
            "llama_baseline": baseline,
            "trained_llm":    trained,
            "delta_vs_llama": round(trained - baseline, 4),
        }

    return {
        "r1_llama_baseline": LLAMA_BASELINE_R1,
        "r2_llama_baseline": LLAMA_BASELINE_R2,
        "r1_rule_based": r1_rule_based,
        "r1_llm": r1_llm,
        "r2_rule_based": {},
        "r2_llm": r2_llm,
        "improvement": improvement,
    }


# ── CLI ───────────────────────────────────────────────────────────────────────

def main():
    """Parse the command line, load or synthesise evaluation data, and draw all charts.

    ``--demo`` takes precedence over ``--eval``; with neither flag the demo data
    is used. Charts 1-4 are always generated. Chart 5 is generated only when
    ``--trainer`` names an existing file. Exits with status 1 when matplotlib
    is not installed or the ``--eval`` file does not exist.
    """
    import sys

    parser = argparse.ArgumentParser(description="Generate result charts for presentation")
    parser.add_argument("--eval",    type=str, default=None,
                        help="Path to r2_evaluation.json from evaluate_r2.py")
    parser.add_argument("--trainer", type=str, default=None,
                        help="Path to trainer_state.json from train_llm.py output")
    parser.add_argument("--demo",    action="store_true",
                        help="Generate charts with real baselines + placeholder trained scores")
    args = parser.parse_args()

    # Fail early with an install hint instead of a traceback from a chart function.
    try:
        import matplotlib  # noqa: F401
    except ImportError:
        print("[ERROR] matplotlib not installed. Run: pip install matplotlib", flush=True)
        sys.exit(1)

    if args.demo:
        eval_data = demo_mode()
    elif args.eval:
        try:
            with open(args.eval, encoding="utf-8") as f:
                eval_data = json.load(f)
        except FileNotFoundError:
            print(f"[ERROR] Evaluation file not found: {args.eval}", flush=True)
            sys.exit(1)
        # Back-fill baseline keys if running against old JSON format
        eval_data.setdefault("r1_llama_baseline", LLAMA_BASELINE_R1)
        eval_data.setdefault("r2_llama_baseline", LLAMA_BASELINE_R2)
    else:
        print("[INFO] No --eval file specified. Using --demo mode.", flush=True)
        eval_data = demo_mode()

    print(f"\nGenerating charts → {CHARTS_DIR}/", flush=True)
    print(f"  Baselines: R1 avg={LLAMA_BASELINE_R1['average']:.4f}  R2 avg={LLAMA_BASELINE_R2['average']:.4f}", flush=True)
    print(f"  Training model: {TRAINING_MODEL}", flush=True)

    print("  Chart 1: R1 scores comparison...", flush=True)
    chart_r1_comparison(eval_data)

    print("  Chart 2: R2 scores comparison...", flush=True)
    chart_r2_comparison(eval_data)

    print("  Chart 3: Sprint reward curves...", flush=True)
    chart_sprint_rewards(eval_data)

    print("  Chart 4: Improvement summary...", flush=True)
    chart_improvement_summary(eval_data)

    # Distinguish "flag not given" from "flag given but file missing" so a
    # mistyped path is not reported as an omitted option.
    if not args.trainer:
        print("  Chart 5: Training curve — skipped (no --trainer file provided)", flush=True)
    elif not Path(args.trainer).exists():
        print(f"  Chart 5: Training curve — skipped (file not found: {args.trainer})", flush=True)
    else:
        print("  Chart 5: Training curve...", flush=True)
        chart_training_curve(args.trainer)

    print(f"\n✅ All charts saved to {CHARTS_DIR}/", flush=True)
    print("   Use improvement_summary.png in your HF blog post and pitch slides.", flush=True)


if __name__ == "__main__":
    main()