# cs3319-project2/code/figures/fig7_error_analysis.py
#
# CS3319 Project 2 final deliverable (public F1 = 0.96626)
#
# f28d994 12 days ago
#
# 3 kB

	"""Fig 7: error analysis & decision-rule robustness.

	(a) F1 by author-degree bucket -> cold-start error structure.
	(b) calibration curve of the final model.
	(c) rank-cutoff ratio vs F1 (with min-max band) vs probability-threshold drift.
	"""
	from pathlib import Path
	import sys

	import numpy as np
	import pandas as pd
	import matplotlib.pyplot as plt
	from sklearn.calibration import calibration_curve

	sys.path.insert(0, str(Path(__file__).resolve().parent))
	from plot_style import apply, save, PALETTE_DEEP as C # noqa: E402

	# Switch on the shared plotting style (from plot_style) before any figure exists.
	apply()

	# Locations are resolved relative to ROOT, the third ancestor directory of
	# this script (parents[2] of its resolved path).
	ROOT = Path(__file__).resolve().parents[2]
	VR = ROOT.joinpath("validation_runs", "dynamic_seed202")
	FIG = ROOT.joinpath("reports", "figures")

	# Validation labels are cast to int and the final model's stored scores to
	# float; both arrays feed the calibration panel further down.
	y = np.load(VR.joinpath("val_labels_seed202.npy")).astype(int)
	final_oof = np.load(
	    VR.joinpath("high_order_graph_stack/rich_rw7_highorder_directed_oof.npy")
	).astype(float)

	# One row of three side-by-side panels: (a), (b), (c).
	fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(16.5, 4.8))

	# --- (a) F1 by degree bucket ---
	# Per-bucket table consumed by plot_buckets (columns used there:
	# bucket_type, bucket, f1).
	buckets = pd.read_csv(VR.joinpath("error_group_calibration/error_analysis_buckets.csv"))


	def plot_buckets(ax, btype, color, title, *, data=None):
	    """Draw a bar chart of F1 per bucket for one bucket type.

	    Rows whose ``bucket_type`` column equals ``btype`` are selected, one bar
	    is drawn per row (in table order) from the ``f1`` column, the ``bucket``
	    column supplies the tick labels, and each bar is annotated with its F1
	    value formatted to two decimals.

	    Parameters
	    ----------
	    ax
	        Axes-like object to draw on; in this script a Matplotlib Axes.
	    btype
	        Value to match in the ``bucket_type`` column.
	    color
	        Bar colour, forwarded to ``ax.bar``.
	    title
	        Panel title.
	    data
	        Optional table with ``bucket_type``, ``bucket`` and ``f1`` columns.
	        When omitted, the module-level ``buckets`` table is used, which is
	        the original behaviour.
	    """
	    frame = buckets if data is None else data
	    # reset_index already returns a new frame, so no explicit copy is needed;
	    # the fresh 0..n-1 index matches the bar positions.
	    sub = frame.loc[frame["bucket_type"] == btype].reset_index(drop=True)
	    positions = range(len(sub))

	    ax.bar(positions, sub["f1"], color=color, alpha=0.85)
	    ax.set_xticks(positions)
	    ax.set_xticklabels(sub["bucket"], rotation=35, ha="right", fontsize=8)

	    for pos, score in zip(positions, sub["f1"]):
	        # A missing F1 has no bar height to sit on and would read "nan".
	        if pd.isna(score):
	            continue
	        # Nudge the label slightly above the bar top.
	        ax.text(pos, score + 0.01, f"{score:.2f}", ha="center", fontsize=8)

	    ax.set_ylabel("F1")
	    # Headroom above 1.0 keeps the label of a perfect-score bar inside the axes.
	    ax.set_ylim(0, 1.05)
	    ax.set_title(title)


	# Panel (a): author-degree buckets, drawn in the first palette colour.
	plot_buckets(
	    axes[0],
	    "author_degree",
	    C[0],
	    "(a) F1 by author degree\n(cold-start authors hardest)",
	)

	# --- (b) calibration curve ---
	ax_cal = axes[1]
	# Ten quantile bins; the first returned array is the per-bin fraction of
	# positives, the second the per-bin mean predicted value.
	frac_pos, mean_pred = calibration_curve(y, final_oof, n_bins=10, strategy="quantile")
	# Dashed diagonal marks perfect calibration for reference.
	ax_cal.plot([0, 1], [0, 1], "k--", lw=1, label="perfect")
	ax_cal.plot(mean_pred, frac_pos, "s-", color=C[2], label="final model")
	ax_cal.set(
	    xlabel="Mean predicted probability",
	    ylabel="Fraction of positives",
	    xlim=(0, 1),
	    ylim=(0, 1),
	    title="(b) Calibration (final model)",
	)
	ax_cal.legend(fontsize=9)

	# --- (c) rank-cutoff ratio vs F1, with min-max band ---
	ax_rank = axes[2]
	ratio = pd.read_csv(ROOT / "validation_runs" / "stack_ratio_analysis.csv")
	# Sort by ratio so the line and band are drawn left to right.
	ratio = ratio.sort_values("ratio")
	cutoffs = ratio["ratio"]
	mean_f1 = ratio["f1_mean"]
	ax_rank.plot(cutoffs, mean_f1, "-o", color=C[0], label="rank-cutoff F1 (mean)")
	ax_rank.fill_between(
	    cutoffs,
	    ratio["f1_min"],
	    ratio["f1_max"],
	    color=C[0],
	    alpha=0.18,
	    label="min–max band",
	)
	# probability-threshold drift marker: a val-tuned prob threshold pushes test positive rate to ~0.52
	ax_rank.axvline(0.52, color=C[3], ls="--", lw=1.5)
	# Annotation sits just right of the marker, slightly above the lowest mean F1.
	ax_rank.text(
	    0.521,
	    mean_f1.min() + 0.0005,
	    "prob-threshold\ndrifts to 0.52",
	    color=C[3],
	    fontsize=8,
	)
	ax_rank.set(
	    xlabel="Positive ratio (rank cutoff)",
	    ylabel="F1",
	    title="(c) Rank-cutoff stability",
	)
	ax_rank.legend(fontsize=8.5, loc="lower left")

	# Figure-level title, then hand the figure to the shared save helper.
	fig.suptitle("Error analysis & decision-rule robustness", y=1.02)
	save(fig, "fig7_error_analysis", FIG)
	print("saved fig7_error_analysis")