cs3319-project2 / code /post95_ablation.py

CS3319 Project 2 final deliverable (public F1 = 0.96626)

f28d994 12 days ago

14.1 kB

	"""Post-0.95 incremental ablations for the hybrid stacker."""

	from __future__ import annotations

	import argparse
	import importlib.util
	import pickle as pkl
	from pathlib import Path

	import lightgbm as lgb
	import numpy as np
	import pandas as pd
	from sklearn.metrics import precision_recall_curve, roc_auc_score
	from sklearn.model_selection import StratifiedKFold


	def load_module(name: str, path: Path):
	    """Import the Python source file at ``path`` as a module named ``name``.

	    The module is executed but is not registered in ``sys.modules``.

	    Args:
	        name: Module name to give the loaded module.
	        path: Location of the source file.

	    Returns:
	        The executed module object.

	    Raises:
	        ImportError: If no import spec/loader can be built for ``path``
	            (for example, the file suffix is not a recognised module type).
	    """
	    spec = importlib.util.spec_from_file_location(name, path)
	    # spec_from_file_location returns None when no loader handles the file,
	    # so check before touching the spec rather than asserting afterwards.
	    if spec is None or spec.loader is None:
	        raise ImportError(f"cannot build an import spec for module {name!r} from {path}")
	    module = importlib.util.module_from_spec(spec)
	    spec.loader.exec_module(module)
	    return module


	def best_f1(y: np.ndarray, s: np.ndarray):
	    """Pick the score threshold that maximises F1 on ``(y, s)``.

	    Args:
	        y: Binary ground-truth labels.
	        s: Predicted scores, aligned with ``y``.

	    Returns:
	        Tuple ``(f1, threshold, auc, precision, recall)`` as Python floats,
	        evaluated at the F1-maximising point of the precision-recall curve.
	        ``auc`` is NaN when ``y`` contains a single class, because ROC AUC is
	        undefined there (this happens when scoring a small group in isolation).
	    """
	    precision, recall, thresholds = precision_recall_curve(y, s)
	    f1 = 2 * precision * recall / (precision + recall + 1e-12)
	    best = int(np.argmax(f1))
	    # The curve carries one more (precision, recall) point than thresholds;
	    # fall back to 0.5 if the best point is that final, threshold-less one.
	    threshold = float(thresholds[best]) if best < len(thresholds) else 0.5
	    if np.unique(y).size > 1:
	        auc = float(roc_auc_score(y, s))
	    else:
	        auc = float("nan")
	    return float(f1[best]), threshold, auc, float(precision[best]), float(recall[best])


	def prf(y: np.ndarray, pred: np.ndarray):
	    """Compute precision, recall and F1 for hard 0/1 predictions.

	    Args:
	        y: Ground-truth labels (0/1).
	        pred: Predicted labels (0/1), aligned with ``y``.

	    Returns:
	        Tuple ``(precision, recall, f1, tp, fp, fn)``. The ratios use a 1e-12
	        term in the denominator, so empty or all-negative inputs give 0.0
	        instead of dividing by zero.
	    """
	    said_pos = pred == 1
	    is_pos = y == 1
	    tp = int(np.count_nonzero(said_pos & is_pos))
	    fp = int(np.count_nonzero(said_pos & (y == 0)))
	    fn = int(np.count_nonzero((pred == 0) & is_pos))
	    precision = tp / (tp + fp + 1e-12)
	    recall = tp / (tp + fn + 1e-12)
	    f1 = 2 * precision * recall / (precision + recall + 1e-12)
	    return precision, recall, f1, tp, fp, fn


	def rank01(x: np.ndarray) -> np.ndarray:
	    """Map values to evenly spaced ranks in [0, 1].

	    The smallest value gets 0.0 and the largest 1.0. Ties are not averaged:
	    the stable sort gives earlier positions the lower rank.

	    Args:
	        x: One-dimensional array of values.

	    Returns:
	        float32 array of the same length holding each element's rank.
	    """
	    n = len(x)
	    ranks = np.empty(n, dtype=np.float32)
	    ascending = np.argsort(x, kind="mergesort")
	    ranks[ascending] = np.linspace(0.0, 1.0, n, dtype=np.float32)
	    return ranks


	def zscore(x: np.ndarray) -> np.ndarray:
	    """Standardise ``x`` to zero mean and unit standard deviation.

	    A 1e-8 term is added to the standard deviation so constant input
	    yields zeros rather than a division by zero.

	    Args:
	        x: Array of values.

	    Returns:
	        float32 array of standardised values.
	    """
	    centered = x - x.mean()
	    scale = x.std() + 1e-8
	    return (centered / scale).astype(np.float32)


	def fit_lgb_oof(X: np.ndarray, y: np.ndarray, seed: int, n_splits: int) -> np.ndarray:
	    """Produce out-of-fold LightGBM probabilities with stratified K-fold CV.

	    Args:
	        X: Feature matrix, one row per sample.
	        y: Binary labels aligned with ``X``.
	        seed: Seed for the fold shuffle; each fold's model is seeded with
	            ``seed + fold`` (folds are numbered from 1).
	        n_splits: Number of stratified folds.

	    Returns:
	        float32 array where each entry is the positive-class probability
	        predicted by the model that did not train on that row.
	    """
	    # NOTE(review): LightGBM documents that `subsample` only takes effect when
	    # `subsample_freq` > 0 -- confirm whether row bagging was intended here.
	    model_params = dict(
	        n_estimators=1200,
	        learning_rate=0.025,
	        num_leaves=31,
	        subsample=0.9,
	        colsample_bytree=0.9,
	        reg_lambda=5.0,
	        min_child_samples=80,
	        objective="binary",
	        verbose=-1,
	    )
	    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
	    oof = np.zeros(len(y), dtype=np.float32)
	    for fold, (train_idx, valid_idx) in enumerate(splitter.split(X, y), start=1):
	        model = lgb.LGBMClassifier(random_state=seed + fold, **model_params)
	        model.fit(X[train_idx], y[train_idx])
	        oof[valid_idx] = model.predict_proba(X[valid_idx])[:, 1]
	    return oof


	def bucket_series(values: np.ndarray, name: str, bins: list[float]) -> pd.Categorical:
	    """Cut ``values`` into left-closed bins with readable string labels.

	    Each bin ``[lo, hi)`` is labelled ``"{name}[{lo},{hi})"``, with infinite
	    edges printed as ``-inf`` / ``inf``.

	    Args:
	        values: Numeric values to bucket.
	        name: Prefix used in every bin label.
	        bins: Monotonically increasing bin edges.

	    Returns:
	        The categorical produced by ``pandas.cut`` for ``values``.
	    """

	    def edge_text(edge: float) -> str:
	        # Infinite edges are spelled out; finite ones use compact %g formatting.
	        if np.isneginf(edge):
	            return "-inf"
	        if np.isposinf(edge):
	            return "inf"
	        return f"{edge:g}"

	    labels = [
	        f"{name}[{edge_text(lo)},{edge_text(hi)})"
	        for lo, hi in zip(bins[:-1], bins[1:])
	    ]
	    return pd.cut(values, bins=bins, labels=labels, include_lowest=True, right=False)


	def error_analysis(
	    y: np.ndarray,
	    score: np.ndarray,
	    pred: np.ndarray,
	    pairs: np.ndarray,
	    X_hand: np.ndarray,
	    score_lgcn: np.ndarray,
	    author_internal_rank: np.ndarray,
	    out_dir: Path,
	):
	    """Break precision/recall/F1 down by several bucketings and save the table.

	    Rows are bucketed by author degree (``X_hand[:, 0]``), paper degree
	    (``X_hand[:, 1]``), deciles of ``score_lgcn``, ``author_internal_rank``
	    and the number of rows each author (``pairs[:, 0]``) has in ``pairs``.
	    The per-bucket metrics are written to
	    ``out_dir / "error_analysis_buckets.csv"`` and printed.

	    Args:
	        y: Ground-truth 0/1 labels.
	        score: Model scores (accepted for interface symmetry; not used here).
	        pred: Hard 0/1 predictions aligned with ``y``.
	        pairs: Array whose first column identifies the author of each row.
	        X_hand: Handcrafted feature matrix; columns 0 and 1 are read.
	        score_lgcn: Score that is cut into deciles for one bucketing.
	        author_internal_rank: Per-row rank value used for one bucketing.
	        out_dir: Directory that receives the CSV.
	    """
	    authors = pairs[:, 0]
	    # How many rows of `pairs` belong to each author, broadcast back per row.
	    rows_per_author = pd.Series(np.arange(len(pairs)), index=authors).groupby(level=0).count()
	    candidate_count = pd.Series(authors).map(rows_per_author).to_numpy()

	    bucketings = {
	        "author_degree": bucket_series(X_hand[:, 0], "author_degree", [-np.inf, 1, 3, 8, 20, 50, np.inf]),
	        "paper_degree": bucket_series(X_hand[:, 1], "paper_degree", [-np.inf, 1, 3, 10, 30, 100, np.inf]),
	        "score_lgcn": pd.qcut(score_lgcn, q=10, duplicates="drop"),
	        "author_internal_rank": bucket_series(author_internal_rank, "author_internal_rank", [-np.inf, 1, 3, 5, 10, 20, 50, np.inf]),
	        "author_candidate_count": bucket_series(candidate_count.astype(np.float32), "author_candidate_count", [-np.inf, 5, 10, 20, 50, 100, np.inf]),
	    }

	    records = []
	    for bucket_type, cats in bucketings.items():
	        for cat in pd.Series(cats).dropna().unique():
	            member = np.asarray(cats == cat)
	            size = int(member.sum())
	            if size == 0:
	                continue
	            precision, recall, f1, _tp, fp, fn = prf(y[member], pred[member])
	            records.append(
	                {
	                    "bucket_type": bucket_type,
	                    "bucket": str(cat),
	                    "n": size,
	                    "positives": int(y[member].sum()),
	                    "pred_pos": int(pred[member].sum()),
	                    "fp": fp,
	                    "fn": fn,
	                    "precision": precision,
	                    "recall": recall,
	                    "f1": f1,
	                }
	            )

	    table = pd.DataFrame(records)
	    table.to_csv(out_dir / "error_analysis_buckets.csv", index=False)
	    print("\nError analysis buckets:")
	    print(table.to_string(index=False, max_rows=80))


	def group_threshold(y: np.ndarray, score: np.ndarray, groups: np.ndarray):
	    """Tune a separate F1-optimal threshold per group and score the result.

	    Thresholds are selected on the same rows they are then applied to, so the
	    returned metrics are in-sample for the threshold choice.

	    Args:
	        y: Ground-truth 0/1 labels.
	        score: Model scores aligned with ``y``.
	        groups: Per-row group label; rows with a missing label keep
	            prediction 0.

	    Returns:
	        Tuple ``(f1, precision, recall, thresholds, pred)`` where
	        ``thresholds`` maps ``str(group)`` to its threshold and ``pred`` is the
	        int8 prediction vector.
	    """
	    thresholds = {}
	    pred = np.zeros(len(y), dtype=np.int8)
	    for label in pd.Series(groups).dropna().unique():
	        member = np.asarray(groups == label)
	        if not member.any():
	            continue
	        th = best_f1(y[member], score[member])[1]
	        pred[member] = (score[member] >= th).astype(np.int8)
	        thresholds[str(label)] = float(th)
	    precision, recall, f1 = prf(y, pred)[:3]
	    return f1, precision, recall, thresholds, pred


	def author_quota_tuning(y: np.ndarray, score: np.ndarray, pairs: np.ndarray, author_degree: np.ndarray):
	    """Search for the best per-author positive quota.

	    For each candidate base ratio in ``linspace(0.46, 0.54, 17)``, every
	    author (``pairs[:, 0]``) has its top-scoring rows predicted positive.
	    The fraction kept is the base ratio plus a degree-bucket adjustment,
	    clipped to [0.05, 0.80]. The bucket is taken from the author's first row.

	    Args:
	        y: Ground-truth 0/1 labels.
	        score: Model scores aligned with ``y``.
	        pairs: Array whose first column identifies the author of each row.
	        author_degree: Per-row author degree used to choose the adjustment.

	    Returns:
	        Dict with keys ``base_ratio``, ``f1``, ``precision``, ``recall`` and
	        ``pred_ratio`` for the best base ratio (first one wins on ties).
	    """
	    buckets = bucket_series(author_degree, "author_degree", [-np.inf, 1, 3, 8, 20, 50, np.inf])
	    # Slightly more permissive for active authors.
	    bucket_adj = {
	        "author_degree[-inf,1)": -0.04,
	        "author_degree[1,3)": -0.02,
	        "author_degree[3,8)": 0.00,
	        "author_degree[8,20)": 0.01,
	        "author_degree[20,50)": 0.02,
	        "author_degree[50,inf)": 0.03,
	    }
	    df = pd.DataFrame({"idx": np.arange(len(y)), "author": pairs[:, 0], "score": score, "bucket": buckets})

	    # Grouping, bucket lookup and score ordering do not depend on the base
	    # ratio, so compute them once per author instead of once per candidate.
	    per_author = []
	    for _, g in df.groupby("author", sort=False):
	        adj = bucket_adj.get(str(g["bucket"].iloc[0]), 0.0)
	        idx = g["idx"].to_numpy()
	        ascending = idx[np.argsort(g["score"].to_numpy())]
	        per_author.append((adj, ascending))

	    best = None
	    for base in np.linspace(0.46, 0.54, 17):
	        pred = np.zeros(len(y), dtype=np.int8)
	        for adj, ascending in per_author:
	            ratio = min(0.80, max(0.05, base + adj))
	            k = int(round(len(ascending) * ratio))
	            if k <= 0:
	                continue
	            # Rows are in ascending score order, so the last k are the top k.
	            pred[ascending[-k:]] = 1
	        precision, recall, f1, *_ = prf(y, pred)
	        row = {"base_ratio": float(base), "f1": f1, "precision": precision, "recall": recall, "pred_ratio": float(pred.mean())}
	        if best is None or f1 > best["f1"]:
	            best = row
	    return best


	def negative_evidence_features(X_hand: np.ndarray, score_lgcn: np.ndarray) -> np.ndarray:
	    """Build eight features that contrast the LGCN score with local evidence.

	    Columns, in order: an indicator that any of ``X_hand`` columns
	    3, 7, 8, 12, 13, 14 sum to a positive value; the score gated by that
	    indicator and by its complement; the score damped by
	    ``log1p(paper_degree + 1)``; the paper-degree rank in [0, 1]; and the
	    paper degree multiplied by columns 7, 8 and 13 respectively.
	    NOTE(review): the meaning of those X_hand columns is defined by the
	    feature builder and is not visible here -- confirm against it.

	    Args:
	        X_hand: Handcrafted feature matrix; column 1 is read as paper degree.
	        score_lgcn: Per-row LGCN score.

	    Returns:
	        float32 array of shape ``(n_rows, 8)``.
	    """
	    paper_degree = X_hand[:, 1]
	    # Accumulate left to right in the same column order as a chained sum.
	    local_overlap = X_hand[:, 3]
	    for col in (7, 8, 12, 13, 14):
	        local_overlap = local_overlap + X_hand[:, col]
	    has_any = (local_overlap > 0).astype(np.float32)
	    no_evidence = 1.0 - has_any

	    columns = [
	        has_any,
	        score_lgcn * has_any,
	        score_lgcn * no_evidence,
	        score_lgcn / np.log1p(paper_degree + 1.0),
	        rank01(paper_degree),
	    ]
	    columns.extend(paper_degree * X_hand[:, col] for col in (7, 8, 13))
	    return np.column_stack(columns).astype(np.float32)


	def topk_content_similarity(root: Path, pairs: np.ndarray, builder) -> np.ndarray:
	    """Compute cached top-k cosine similarities between a paper and an author's papers.

	    For each ``(author, paper)`` row, the paper's L2-normalised feature vector
	    is compared with the vectors of ``builder.author_papers[author]``. The
	    three output columns are the maximum similarity, the mean of the top 3
	    and the mean of the top 5. Rows whose author has no papers stay zero.

	    Results are cached under ``root/validation_runs/feature_cache``. The cache
	    key is a hash of the exact contents and order of ``pairs``.
	    NOTE(review): the key does not capture ``builder`` state, so a cache
	    written for one training graph would be reused for another with the same
	    ``pairs`` -- confirm this cannot happen in the calling workflow.

	    Args:
	        root: Package root containing ``data_and_docs/feature.pkl``.
	        pairs: Integer array of ``(author, paper)`` rows.
	        builder: Object exposing an ``author_papers`` mapping/sequence.

	    Returns:
	        float32 array of shape ``(len(pairs), 3)``.
	    """
	    import hashlib  # local import: only needed to derive the cache key

	    cache = root / "validation_runs" / "feature_cache"
	    cache.mkdir(parents=True, exist_ok=True)
	    # Hash the pair contents: row count plus column sums would collide for
	    # reordered (or otherwise different) pair sets and return misaligned rows.
	    pair_bytes = np.ascontiguousarray(pairs, dtype=np.int64).tobytes()
	    digest = hashlib.sha256(pair_bytes).hexdigest()[:16]
	    path = cache / f"topk_content_{len(pairs)}_{digest}.npy"
	    if path.exists():
	        cached = np.load(path)
	        # Only trust a cache file that has the expected layout.
	        if cached.shape == (len(pairs), 3):
	            return cached

	    # NOTE(review): pickle.load can execute arbitrary code; feature.pkl must
	    # come from a trusted source.
	    with (root / "data_and_docs" / "feature.pkl").open("rb") as f:
	        feat = pkl.load(f).numpy().astype(np.float32)
	    feat /= np.linalg.norm(feat, axis=1, keepdims=True) + 1e-8

	    out = np.zeros((len(pairs), 3), dtype=np.float32)
	    for i, (a_raw, p_raw) in enumerate(pairs):
	        papers = list(builder.author_papers[int(a_raw)])
	        if not papers:
	            continue
	        sims = feat[np.asarray(papers, dtype=np.int64)] @ feat[int(p_raw)]
	        top = np.sort(sims)[::-1]  # descending similarity
	        out[i, 0] = top[0]
	        out[i, 1] = top[:3].mean()
	        out[i, 2] = top[:5].mean()
	    np.save(path, out)
	    return out


	def load_lgcn_variant_scores(root: Path, split_seed: int, y: np.ndarray, max_cols: int = 20):
	    """Load saved validation score files and turn the best ones into features.

	    Score files are searched under
	    ``root/validation_runs/dynamic_seed{split_seed}``. Files whose path inside
	    that directory mentions ``hgt``, ``sage``, ``bce``, ``norm`` or ``hinge``
	    are skipped, as are files with the wrong length or (near-)constant scores.
	    The remaining files are ranked by best F1 against ``y`` and the top
	    ``max_cols`` are kept.

	    Args:
	        root: Package root directory.
	        split_seed: Seed identifying the validation run directory.
	        y: Binary labels used to rank the score files.
	        max_cols: Maximum number of score files to keep.

	    Returns:
	        Tuple ``(features, names)``. For each kept file there is a z-scored
	        and a rank-transformed column, followed by three aggregate columns
	        (z-scored mean, z-scored std and ranked mean across kept files).
	        With no usable file, ``features`` has shape ``(len(y), 0)`` and
	        ``names`` is empty.
	    """
	    run_dir = root / "validation_runs" / f"dynamic_seed{split_seed}"
	    # NOTE(review): wildcards restored on the assumption the pattern lost its
	    # `*` characters; without them at most one file could match. Confirm
	    # against the on-disk layout. The literal path still matches.
	    files = sorted(run_dir.glob("dyn*/scores/val_*.npy"))
	    excluded_tags = ("hgt", "sage", "bce", "norm", "hinge")
	    rows = []
	    for p in files:
	        # Match tags on the path inside the run directory only, so a tag that
	        # happens to occur in a parent directory name does not drop every file.
	        rel = p.relative_to(run_dir).as_posix()
	        if any(tag in rel for tag in excluded_tags):
	            continue
	        x = np.load(p).astype(np.float32)
	        if len(x) != len(y) or np.std(x) < 1e-8:
	            continue
	        f1, _, auc, _, _ = best_f1(y, x)
	        rows.append((f1, auc, str(p), x))
	    rows.sort(key=lambda r: r[0], reverse=True)
	    chosen = rows[:max_cols]
	    if not chosen:
	        return np.zeros((len(y), 0), dtype=np.float32), []
	    cols = []
	    names = []
	    raw_stack = []
	    for _, _, name, x in chosen:
	        raw_stack.append(x)
	        cols.extend([zscore(x), rank01(x)])
	        names.extend([name + "::z", name + "::rank"])
	    raw = np.vstack(raw_stack)
	    variant_mean = raw.mean(axis=0)
	    cols.extend([zscore(variant_mean), zscore(raw.std(axis=0)), rank01(variant_mean)])
	    names.extend(["lgcn_variant_mean_z", "lgcn_variant_std_z", "lgcn_variant_mean_rank"])
	    return np.column_stack(cols).astype(np.float32), names


	def main():
	    """Run the ablation stages from the command line and save the results.

	    Stages, in order: baseline stacking on rank + handcrafted features (with
	    an error-analysis breakdown), per-group threshold tuning, per-author
	    quota tuning, then three cumulative feature additions (negative-evidence
	    features, top-k content similarity, LGCN variant scores). The summary
	    table, threshold CSVs and out-of-fold score arrays are written to
	    ``<package-root>/validation_runs/dynamic_seed<split-seed>/post95_ablation``.
	    """
	    cli = argparse.ArgumentParser()
	    cli.add_argument("--package-root", type=Path, default=Path(__file__).resolve().parents[1])
	    cli.add_argument("--split-seed", type=int, required=True)
	    cli.add_argument("--lgcn-score-file", type=Path, required=True)
	    cli.add_argument("--n-splits", type=int, default=5)
	    cli.add_argument("--seed", type=int, default=0)
	    args = cli.parse_args()

	    root = args.package_root
	    code_dir = root / "code"
	    stack_mod = load_module("stack_rank_calibration", code_dir / "stack_rank_calibration.py")
	    lgcn_mod = load_module("train_val_lgcn_ensemble", code_dir / "train_val_lgcn_ensemble.py")
	    train_refs, val_pairs = lgcn_mod.make_notebook_style_split(root, args.split_seed, 0.9)
	    pairs = val_pairs[["source", "target"]].to_numpy(np.int64)
	    y = val_pairs["label"].to_numpy(np.int8)
	    score_lgcn = np.load(args.lgcn_score_file).astype(np.float32)
	    builder = stack_mod.ExplicitGraphFeatures(root, train_refs)
	    out_dir = root / "validation_runs" / f"dynamic_seed{args.split_seed}" / "post95_ablation"
	    out_dir.mkdir(parents=True, exist_ok=True)

	    rows = []

	    def record_stage(stage, oof, n_features):
	        # Score one stage's OOF predictions and append its summary row.
	        f1, th, auc, precision, recall = best_f1(y, oof)
	        rows.append({"stage": stage, "f1": f1, "threshold": th, "auc": auc, "precision": precision, "recall": recall, "n_features": n_features})
	        return th, auc

	    print("building baseline handcrafted/rank features")
	    X_hand = builder.transform(pairs)
	    X_rank = stack_mod.add_rank_features(pairs, score_lgcn)
	    X_base = np.column_stack([X_rank, X_hand]).astype(np.float32)
	    n_base = X_base.shape[1]

	    base_oof = fit_lgb_oof(X_base, y, args.seed, args.n_splits)
	    base_th, base_auc = record_stage("baseline_stacking", base_oof, n_base)
	    base_pred = (base_oof >= base_th).astype(np.int8)
	    error_analysis(y, base_oof, base_pred, pairs, X_hand, score_lgcn, X_rank[:, 3], out_dir)

	    # Group threshold tuning on baseline OOF scores.
	    groupings = [
	        ("group_threshold_author_degree", bucket_series(X_hand[:, 0], "author_degree", [-np.inf, 1, 3, 8, 20, 50, np.inf])),
	        ("group_threshold_score_lgcn", pd.qcut(score_lgcn, q=10, duplicates="drop")),
	    ]
	    for name, group in groupings:
	        gf1, gp, gr, thresholds, _ = group_threshold(y, base_oof, np.asarray(group))
	        # Thresholding does not change the ranking, so the baseline AUC is reported.
	        rows.append({"stage": name, "f1": gf1, "threshold": np.nan, "auc": base_auc, "precision": gp, "recall": gr, "n_features": n_base})
	        pd.Series(thresholds).to_csv(out_dir / f"{name}_thresholds.csv")

	    quota = author_quota_tuning(y, base_oof, pairs, X_hand[:, 0])
	    rows.append({"stage": "author_quota_by_degree", "f1": quota["f1"], "threshold": quota["base_ratio"], "auc": np.nan, "precision": quota["precision"], "recall": quota["recall"], "n_features": n_base})

	    print("adding negative-evidence features")
	    X_neg = np.column_stack([X_base, negative_evidence_features(X_hand, score_lgcn)]).astype(np.float32)
	    neg_oof = fit_lgb_oof(X_neg, y, args.seed + 11, args.n_splits)
	    record_stage("negative_evidence_features", neg_oof, X_neg.shape[1])

	    print("adding top-k content similarity features")
	    X_sim = np.column_stack([X_neg, topk_content_similarity(root, pairs, builder)]).astype(np.float32)
	    sim_oof = fit_lgb_oof(X_sim, y, args.seed + 22, args.n_splits)
	    record_stage("topk_similarity_features", sim_oof, X_sim.shape[1])

	    print("adding multi-LightGCN variant score features")
	    X_var, names = load_lgcn_variant_scores(root, args.split_seed, y)
	    (out_dir / "lgcn_variant_feature_names.txt").write_text("\n".join(names) + "\n")
	    X_ens = np.column_stack([X_sim, X_var]).astype(np.float32)
	    ens_oof = fit_lgb_oof(X_ens, y, args.seed + 33, args.n_splits)
	    record_stage("ensemble_lgcn_score_features", ens_oof, X_ens.shape[1])

	    result = pd.DataFrame(rows).sort_values("f1", ascending=False)
	    result.to_csv(out_dir / "ablation_table.csv", index=False)
	    saved_scores = (
	        ("baseline_oof.npy", base_oof),
	        ("negative_oof.npy", neg_oof),
	        ("similarity_oof.npy", sim_oof),
	        ("ensemble_lgcn_oof.npy", ens_oof),
	    )
	    for filename, oof in saved_scores:
	        np.save(out_dir / filename, oof)
	    print("\nAblation table:")
	    print(result.to_string(index=False))


	# Run the ablation pipeline only when this file is executed as a script.
	if __name__ == "__main__":
	    main()