"""Figure 7: error analysis and decision-rule robustness.

Three panels are drawn side by side:

(a) F1 per author-degree bucket (only the ``author_degree`` bucket type is
    plotted), showing the cold-start error structure.
(b) Calibration curve of the final model's out-of-fold predictions.
(c) F1 versus rank-cutoff positive ratio, with a min-max band and a marker
    for the probability-threshold drift.
"""
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.calibration import calibration_curve

# plot_style sits next to this script, so its directory must be importable
# before the import below runs.
sys.path.insert(0, str(Path(__file__).resolve().parent))
from plot_style import apply, save, PALETTE_DEEP as C  # noqa: E402

apply()

# Repository-relative locations, resolved from this file's own path.
ROOT = Path(__file__).resolve().parents[2]
FIG = ROOT / "reports" / "figures"
VR = ROOT / "validation_runs" / "dynamic_seed202"

# Validation labels and the final model's out-of-fold scores.
y = np.load(VR / "val_labels_seed202.npy").astype(int)
final_oof = np.load(
    VR / "high_order_graph_stack/rich_rw7_highorder_directed_oof.npy"
).astype(float)

fig, axes = plt.subplots(1, 3, figsize=(16.5, 4.8))

# --- (a) F1 by degree bucket ---
buckets = pd.read_csv(VR / "error_group_calibration/error_analysis_buckets.csv")


def plot_buckets(ax, btype, color, title):
    """Draw a labelled F1 bar chart for one bucket type.

    Rows of the module-level ``buckets`` table whose ``bucket_type`` equals
    ``btype`` are drawn as bars in table order. Each bar is labelled on the
    x-axis with its ``bucket`` value and annotated with its ``f1`` value
    rounded to two decimals.

    Args:
        ax: Matplotlib axes to draw on.
        btype: Value of the ``bucket_type`` column to select.
        color: Bar colour.
        title: Axes title.
    """
    rows = buckets[buckets["bucket_type"] == btype].copy().reset_index(drop=True)
    slots = range(len(rows))

    ax.bar(slots, rows["f1"], color=color, alpha=0.85)
    ax.set_xticks(slots)
    ax.set_xticklabels(rows["bucket"], rotation=35, ha="right", fontsize=8)

    # Print each score just above its bar.
    for slot, score in zip(slots, rows["f1"]):
        ax.text(slot, score + 0.01, f"{score:.2f}", ha="center", fontsize=8)

    ax.set_ylabel("F1")
    ax.set_ylim(0, 1.05)
    ax.set_title(title)


plot_buckets(
    axes[0],
    "author_degree",
    C[0],
    "(a) F1 by author degree\n(cold-start authors hardest)",
)

# --- (b) calibration curve ---
# Ten quantile bins; the diagonal is the perfectly calibrated reference.
frac_pos, mean_pred = calibration_curve(
    y, final_oof, n_bins=10, strategy="quantile"
)
cal_ax = axes[1]
cal_ax.plot([0, 1], [0, 1], "k--", lw=1, label="perfect")
cal_ax.plot(mean_pred, frac_pos, "s-", color=C[2], label="final model")
cal_ax.set_xlabel("Mean predicted probability")
cal_ax.set_ylabel("Fraction of positives")
cal_ax.set_xlim(0, 1)
cal_ax.set_ylim(0, 1)
cal_ax.set_title("(b) Calibration (final model)")
# Legend for the calibration panel (b).
axes[1].legend(fontsize=9)

# --- (c) rank-cutoff ratio vs F1, with min-max band ---
ratio_csv = ROOT / "validation_runs" / "stack_ratio_analysis.csv"
ratio = pd.read_csv(ratio_csv).sort_values("ratio")

rank_ax = axes[2]
cutoff = ratio["ratio"]
f1_mean = ratio["f1_mean"]

# Mean F1 per cutoff ratio, with a shaded band from f1_min to f1_max.
rank_ax.plot(cutoff, f1_mean, "-o", color=C[0], label="rank-cutoff F1 (mean)")
rank_ax.fill_between(
    cutoff,
    ratio["f1_min"],
    ratio["f1_max"],
    color=C[0],
    alpha=0.18,
    label="min–max band",
)

# Probability-threshold drift marker. Per the original author's note, a
# validation-tuned probability threshold pushes the test positive rate to
# about 0.52; the value is hard-coded here, not computed.
rank_ax.axvline(0.52, color=C[3], ls="--", lw=1.5)
rank_ax.text(
    0.521,
    f1_mean.min() + 0.0005,
    "prob-threshold\ndrifts to 0.52",
    color=C[3],
    fontsize=8,
)

rank_ax.set_xlabel("Positive ratio (rank cutoff)")
rank_ax.set_ylabel("F1")
rank_ax.set_title("(c) Rank-cutoff stability")
rank_ax.legend(fontsize=8.5, loc="lower left")

# Title the figure, write it via the project's save helper, and report.
fig.suptitle("Error analysis & decision-rule robustness", y=1.02)
save(fig, "fig7_error_analysis", FIG)
print("saved fig7_error_analysis")