"""Fig 7: error analysis & decision-rule robustness.
(a) F1 by author/paper degree bucket -> cold-start error structure.
(b) calibration curve of the final model.
(c) rank-cutoff ratio vs F1 (with min-max band) vs probability-threshold drift.
"""
from pathlib import Path
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.calibration import calibration_curve
sys.path.insert(0, str(Path(__file__).resolve().parent))
from plot_style import apply, save, PALETTE_DEEP as C # noqa: E402
# plot_style.apply() is a project helper; presumably it installs the shared
# figure style -- confirm in plot_style.
apply()

# Directory layout, resolved from this script's own location.
ROOT = Path(__file__).resolve().parents[2]
FIG = ROOT.joinpath("reports", "figures")
VR = ROOT.joinpath("validation_runs", "dynamic_seed202")

# Labels and final-model scores for the seed-202 validation run
# ("oof" presumably means out-of-fold predictions -- confirm).
y = np.load(VR.joinpath("val_labels_seed202.npy")).astype(int)
final_oof = np.load(
    VR / "high_order_graph_stack/rich_rw7_highorder_directed_oof.npy"
).astype(float)

# One row of three panels: (a), (b), (c).
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(16.5, 4.8))

# --- (a) F1 by degree bucket ---
# Table filtered by plot_buckets(); it reads the columns
# "bucket_type", "bucket" and "f1".
buckets = pd.read_csv(
    VR / "error_group_calibration/error_analysis_buckets.csv"
)
def plot_buckets(ax, btype, color, title):
    """Draw one value-annotated F1 bar per bucket of the given bucket type.

    Rows are taken from the module-level ``buckets`` table where the
    ``bucket_type`` column equals ``btype``; bars keep the table's row order.

    Args:
        ax: Matplotlib axes to draw on.
        btype: Value matched against the ``bucket_type`` column.
        color: Fill colour of the bars.
        title: Title placed on ``ax``.
    """
    rows = buckets.loc[buckets["bucket_type"] == btype].reset_index(drop=True)
    scores = rows["f1"]
    positions = range(len(rows))

    ax.bar(positions, scores, color=color, alpha=0.85)
    ax.set_xticks(positions)
    ax.set_xticklabels(rows["bucket"], rotation=35, ha="right", fontsize=8)

    # Write each bar's F1 just above its top edge.
    for pos, score in zip(positions, scores):
        ax.text(pos, score + 0.01, f"{score:.2f}", ha="center", fontsize=8)

    ax.set_ylabel("F1")
    # Upper limit sits above 1.0 so the label on a full-height bar has room.
    ax.set_ylim(0, 1.05)
    ax.set_title(title)
# Panel (a): only the "author_degree" buckets are drawn.
plot_buckets(
    axes[0],
    "author_degree",
    C[0],
    "(a) F1 by author degree\n(cold-start authors hardest)",
)

# --- (b) calibration curve ---
# calibration_curve returns (fraction of positives, mean predicted value)
# per bin; the "quantile" strategy picks bin edges from score quantiles.
frac_pos, mean_pred = calibration_curve(
    y, final_oof, n_bins=10, strategy="quantile"
)

ax_cal = axes[1]
# Diagonal = perfectly calibrated reference.
ax_cal.plot([0, 1], [0, 1], "k--", lw=1, label="perfect")
ax_cal.plot(mean_pred, frac_pos, "s-", color=C[2], label="final model")
ax_cal.set_title("(b) Calibration (final model)")
ax_cal.set_xlabel("Mean predicted probability")
ax_cal.set_ylabel("Fraction of positives")
ax_cal.set_xlim(0, 1)
ax_cal.set_ylim(0, 1)
ax_cal.legend(fontsize=9)
# --- (c) rank-cutoff ratio vs F1, with min-max band ---
ratio = pd.read_csv(ROOT / "validation_runs" / "stack_ratio_analysis.csv")
ratio = ratio.sort_values("ratio")  # sorted so the line and band run left to right

ax_rank = axes[2]
cutoff = ratio["ratio"]
mean_f1 = ratio["f1_mean"]

ax_rank.plot(cutoff, mean_f1, "-o", color=C[0], label="rank-cutoff F1 (mean)")
ax_rank.fill_between(
    cutoff,
    ratio["f1_min"],
    ratio["f1_max"],
    color=C[0],
    alpha=0.18,
    label="min–max band",
)

# probability-threshold drift marker: a val-tuned prob threshold pushes
# test positive rate to ~0.52
ax_rank.axvline(0.52, color=C[3], ls="--", lw=1.5)
# Annotation sits just right of the marker, slightly above the lowest mean F1.
ax_rank.text(
    0.521,
    mean_f1.min() + 0.0005,
    "prob-threshold\ndrifts to 0.52",
    color=C[3],
    fontsize=8,
)

ax_rank.set_title("(c) Rank-cutoff stability")
ax_rank.set_xlabel("Positive ratio (rank cutoff)")
ax_rank.set_ylabel("F1")
ax_rank.legend(fontsize=8.5, loc="lower left")

fig.suptitle("Error analysis & decision-rule robustness", y=1.02)

# save() is the plot_style helper; it receives the figure, a base name and
# the figures directory.
save(fig, "fig7_error_analysis", FIG)
print("saved fig7_error_analysis")