"""Fig 7: error analysis & decision-rule robustness.

(a) F1 by author-degree bucket (the only bucket type drawn here) -> cold-start error structure.
(b) calibration curve of the final model.
(c) rank-cutoff ratio vs F1 (with min-max band) vs probability-threshold drift.
"""
from pathlib import Path
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.calibration import calibration_curve

sys.path.insert(0, str(Path(__file__).resolve().parent))
from plot_style import apply, save, PALETTE_DEEP as C  # noqa: E402

# Activate the shared plotting style before any figure is created.
apply()

# Project locations: ROOT is the third ancestor (parents[2]) of this script.
_THIS_FILE = Path(__file__).resolve()
ROOT = _THIS_FILE.parents[2]
FIG = ROOT / "reports" / "figures"
VR = ROOT / "validation_runs" / "dynamic_seed202"

# Validation labels and the final model's out-of-fold scores, cast to int / float.
_labels_file = VR / "val_labels_seed202.npy"
_oof_file = VR / "high_order_graph_stack/rich_rw7_highorder_directed_oof.npy"
y = np.load(_labels_file).astype(int)
final_oof = np.load(_oof_file).astype(float)

# One row of three panels: (a), (b), (c).
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(16.5, 4.8))

# --- (a) F1 by degree bucket ---
# Table consumed by plot_buckets(); it reads the bucket_type, bucket and f1 columns.
_buckets_file = VR / "error_group_calibration/error_analysis_buckets.csv"
buckets = pd.read_csv(_buckets_file)


def plot_buckets(ax, btype, color, title):
    """Draw a labelled bar chart of F1 per bucket for one bucket type.

    Rows are taken from the module-level ``buckets`` table where
    ``bucket_type == btype`` and are plotted in the order they appear there.

    Args:
        ax: Axes to draw on (the script passes one of the ``plt.subplots`` axes).
        btype: Value of the ``bucket_type`` column to select.
        color: Bar colour forwarded to ``ax.bar``.
        title: Panel title.
    """
    # reset_index() already returns a fresh frame, so the selection is never
    # a view onto the shared table.
    selected = buckets.loc[buckets["bucket_type"] == btype].reset_index(drop=True)
    scores = selected["f1"]
    positions = range(len(selected))

    ax.bar(positions, scores, color=color, alpha=0.85)
    ax.set_xticks(positions)
    ax.set_xticklabels(selected["bucket"], rotation=35, ha="right", fontsize=8)

    # Print each F1 value just above its bar.
    for pos, score in zip(positions, scores):
        ax.text(pos, score + 0.01, f"{score:.2f}", ha="center", fontsize=8)

    # The y-range goes slightly past 1 so a label above a full-height bar still fits.
    ax.set(ylabel="F1", ylim=(0, 1.05), title=title)


# Panel (a): only the author-degree buckets are drawn.
plot_buckets(axes[0], "author_degree", C[0], "(a) F1 by author degree\n(cold-start authors hardest)")

# --- (b) calibration curve ---
ax_cal = axes[1]
# Quantile binning: 10 bins, each holding roughly the same number of samples.
frac_pos, mean_pred = calibration_curve(y, final_oof, n_bins=10, strategy="quantile")
ax_cal.plot([0, 1], [0, 1], "k--", lw=1, label="perfect")  # diagonal reference line
ax_cal.plot(mean_pred, frac_pos, "s-", color=C[2], label="final model")
ax_cal.set(
    xlabel="Mean predicted probability",
    ylabel="Fraction of positives",
    xlim=(0, 1),
    ylim=(0, 1),
    title="(b) Calibration (final model)",
)
ax_cal.legend(fontsize=9)

# --- (c) rank-cutoff ratio vs F1, with min-max band ---
ax_ratio = axes[2]
ratio = pd.read_csv(ROOT / "validation_runs" / "stack_ratio_analysis.csv")
ratio = ratio.sort_values("ratio")  # sorted by x so the line and band are drawn left to right
cutoffs = ratio["ratio"]
mean_f1 = ratio["f1_mean"]
ax_ratio.plot(cutoffs, mean_f1, "-o", color=C[0], label="rank-cutoff F1 (mean)")
ax_ratio.fill_between(
    cutoffs,
    ratio["f1_min"],
    ratio["f1_max"],
    color=C[0],
    alpha=0.18,
    label="min–max band",
)
# probability-threshold drift marker: a val-tuned prob threshold pushes test positive rate to ~0.52
ax_ratio.axvline(0.52, color=C[3], ls="--", lw=1.5)
# The annotation sits just right of the marker, slightly above the lowest mean F1.
ax_ratio.text(
    0.521,
    mean_f1.min() + 0.0005,
    "prob-threshold\ndrifts to 0.52",
    color=C[3],
    fontsize=8,
)
ax_ratio.set(
    xlabel="Positive ratio (rank cutoff)",
    ylabel="F1",
    title="(c) Rank-cutoff stability",
)
ax_ratio.legend(fontsize=8.5, loc="lower left")

# Figure-level title, then hand the figure to the shared save helper.
fig.suptitle("Error analysis & decision-rule robustness", y=1.02)
save(fig, "fig7_error_analysis", FIG)
print("saved fig7_error_analysis")