"""Richer feature.pkl content features for the post95 + BPR-MF stacker.""" from __future__ import annotations import argparse import importlib.util import pickle as pkl from pathlib import Path import lightgbm as lgb import numpy as np import pandas as pd from sklearn.metrics import precision_recall_curve, roc_auc_score from sklearn.model_selection import StratifiedKFold def load_module(name: str, path: Path): spec = importlib.util.spec_from_file_location(name, path) module = importlib.util.module_from_spec(spec) assert spec.loader is not None spec.loader.exec_module(module) return module def best_f1(y: np.ndarray, s: np.ndarray): p, r, t = precision_recall_curve(y, s) f = 2 * p * r / (p + r + 1e-12) i = int(np.argmax(f)) th = float(t[i]) if i < len(t) else 0.5 return float(f[i]), th, float(roc_auc_score(y, s)), float(p[i]), float(r[i]) def fit_lgb_oof(X: np.ndarray, y: np.ndarray, seed: int, n_splits: int, *, ranker_like: bool = False) -> np.ndarray: oof = np.zeros(len(y), dtype=np.float32) skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed) for fold, (tr, va) in enumerate(skf.split(X, y), start=1): clf = lgb.LGBMClassifier( n_estimators=1200 if not ranker_like else 800, learning_rate=0.025 if not ranker_like else 0.03, num_leaves=31, subsample=0.9, colsample_bytree=0.9, reg_lambda=5.0, min_child_samples=80, objective="binary", verbose=-1, random_state=seed + fold, ) clf.fit(X[tr], y[tr]) oof[va] = clf.predict_proba(X[va])[:, 1] return oof def content_rich_features(root: Path, pairs: np.ndarray, builder) -> np.ndarray: cache = root / "validation_runs" / "feature_cache" cache.mkdir(parents=True, exist_ok=True) path = cache / f"content_rich_{len(pairs)}_{int(pairs[:,0].sum())}_{int(pairs[:,1].sum())}.npy" if path.exists(): return np.load(path) with (root / "data_and_docs" / "feature.pkl").open("rb") as f: feat = pkl.load(f).numpy().astype(np.float32) feat /= np.linalg.norm(feat, axis=1, keepdims=True) + 1e-8 n_authors = builder.num_authors dim = feat.shape[1] mean = np.zeros((n_authors, dim), dtype=np.float32) mean_normed = np.zeros((n_authors, dim), dtype=np.float32) std_scalar = np.zeros(n_authors, dtype=np.float32) mean_pair_cos = np.zeros(n_authors, dtype=np.float32) hist_count = np.zeros(n_authors, dtype=np.float32) for a in range(n_authors): hist = np.asarray(list(builder.author_papers[a]), dtype=np.int64) hist_count[a] = len(hist) if len(hist) == 0: continue H = feat[hist] m = H.mean(axis=0) mean[a] = m mean_normed[a] = m / (np.linalg.norm(m) + 1e-8) dist = np.sum((H - m) ** 2, axis=1) std_scalar[a] = float(np.sqrt(dist.mean())) if len(hist) > 1: sims = H @ H.T mean_pair_cos[a] = float((sims.sum() - len(hist)) / (len(hist) * (len(hist) - 1))) else: mean_pair_cos[a] = 1.0 out = np.zeros((len(pairs), 18), dtype=np.float32) order = np.argsort(pairs[:, 0], kind="mergesort") authors = pairs[order, 0] boundaries = np.r_[0, np.flatnonzero(authors[1:] != authors[:-1]) + 1, len(order)] for lo, hi in zip(boundaries[:-1], boundaries[1:]): idx = order[lo:hi] a = int(pairs[idx[0], 0]) cand = pairs[idx, 1].astype(np.int64) C = feat[cand] center_cos = C @ mean_normed[a] center_l2 = np.sqrt(np.sum((C - mean[a]) ** 2, axis=1)) out[idx, 0] = center_cos out[idx, 1] = center_l2 out[idx, 2] = hist_count[a] out[idx, 3] = np.log1p(hist_count[a]) out[idx, 4] = std_scalar[a] out[idx, 5] = mean_pair_cos[a] out[idx, 6] = center_cos / (std_scalar[a] + 1e-3) hist = np.asarray(list(builder.author_papers[a]), dtype=np.int64) if len(hist) == 0: continue sims = C @ feat[hist].T out[idx, 7] = sims.max(axis=1) out[idx, 8] = sims.mean(axis=1) out[idx, 9] = sims.std(axis=1) out[idx, 10] = np.median(sims, axis=1) for col, k in [(11, 3), (12, 5), (13, 10)]: kk = min(k, sims.shape[1]) out[idx, col] = np.partition(sims, -kk, axis=1)[:, -kk:].mean(axis=1) out[idx, 14] = (sims > 0.5).mean(axis=1) out[idx, 15] = (sims > 0.7).mean(axis=1) # Percentile of candidate center similarity among this author's test/val candidates. vals = center_cos local_order = np.argsort(vals, kind="mergesort") pct = np.linspace(0, 1, len(vals), dtype=np.float32) if len(vals) > 1 else np.array([1.0], dtype=np.float32) tmp = np.zeros(len(vals), dtype=np.float32) tmp[local_order] = pct out[idx, 16] = tmp out[idx, 17] = 1.0 - tmp np.save(path, out) return out def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("--package-root", type=Path, default=Path(__file__).resolve().parents[1]) parser.add_argument("--split-seed", type=int, default=202) parser.add_argument("--main-val-score-file", type=Path, required=True) parser.add_argument("--seed", type=int, default=202) parser.add_argument("--n-splits", type=int, default=5) args = parser.parse_args() root = args.package_root stack = load_module("stack", root / "code" / "stack_rank_calibration.py") lgcn = load_module("lgcn", root / "code" / "train_val_lgcn_ensemble.py") post = load_module("post", root / "code" / "post95_ablation.py") gen = load_module("gen", root / "code" / "generate_post95_submission.py") extra = load_module("extra", root / "code" / "extra_score_sources_ablation.py") train_refs, val_pairs = lgcn.make_notebook_style_split(root, args.split_seed, 0.9) pairs = val_pairs[["source", "target"]].to_numpy(np.int64) y = val_pairs["label"].to_numpy(np.int8) main = np.load(args.main_val_score_file).astype(np.float32) out_dir = root / "validation_runs" / f"dynamic_seed{args.split_seed}" / "content_rich" out_dir.mkdir(parents=True, exist_ok=True) builder = stack.ExplicitGraphFeatures(root, train_refs) X_hand = builder.transform(pairs) X_base = np.column_stack( [ stack.add_rank_features(pairs, main), X_hand, post.negative_evidence_features(X_hand, main), gen.topk_content_similarity_fast(root, pairs, builder), ] ).astype(np.float32) selected = [Path(x.strip()) for x in (root / "validation_runs" / f"dynamic_seed{args.split_seed}" / "post95_submission" / "selected_variant_val_scores.txt").read_text().splitlines() if x.strip()] X_base = np.column_stack([X_base, gen.variant_feature_matrix(post, [np.load(p).astype(np.float32) for p in selected])]).astype(np.float32) content = extra.content_mean_score(root, pairs, builder) mf = np.load(root / "validation_runs" / f"dynamic_seed{args.split_seed}" / "extra_score_sources" / f"val_mf_bpr_s{args.seed}_d256.npy").astype(np.float32) Xc, _ = extra.score_to_features(content, "content_mean_cos", pairs) Xm, _ = extra.score_to_features(mf, "mf_bpr", pairs) X_cm = np.column_stack([X_base, Xc, Xm]).astype(np.float32) rows = [] print("baseline content+mf stack") oof = fit_lgb_oof(X_cm, y, args.seed, args.n_splits) f1, th, auc, p, r = best_f1(y, oof) rows.append({"stage": "content_mf_baseline", "f1": f1, "threshold": th, "auc": auc, "precision": p, "recall": r, "n_features": X_cm.shape[1]}) np.save(out_dir / "content_mf_baseline_oof.npy", oof) print("rich content feature-only model") X_rich = content_rich_features(root, pairs, builder) rich_oof = fit_lgb_oof(X_rich, y, args.seed + 7, args.n_splits, ranker_like=True) f1, th, auc, p, r = best_f1(y, rich_oof) rows.append({"stage": "rich_content_only_lgb", "f1": f1, "threshold": th, "auc": auc, "precision": p, "recall": r, "n_features": X_rich.shape[1]}) np.save(out_dir / "rich_content_only_oof.npy", rich_oof) print("stack + rich content raw features") X_all = np.column_stack([X_cm, X_rich]).astype(np.float32) oof = fit_lgb_oof(X_all, y, args.seed + 11, args.n_splits) f1, th, auc, p, r = best_f1(y, oof) rows.append({"stage": "+rich_content_features", "f1": f1, "threshold": th, "auc": auc, "precision": p, "recall": r, "n_features": X_all.shape[1]}) np.save(out_dir / "rich_content_stack_oof.npy", oof) print("stack + rich content model score") X_score, _ = extra.score_to_features(rich_oof, "rich_content_lgb_oof", pairs) X_all_score = np.column_stack([X_all, X_score]).astype(np.float32) oof = fit_lgb_oof(X_all_score, y, args.seed + 13, args.n_splits) f1, th, auc, p, r = best_f1(y, oof) rows.append({"stage": "+rich_content_model_score", "f1": f1, "threshold": th, "auc": auc, "precision": p, "recall": r, "n_features": X_all_score.shape[1]}) np.save(out_dir / "rich_content_model_score_stack_oof.npy", oof) result = pd.DataFrame(rows).sort_values("f1", ascending=False) result.to_csv(out_dir / "content_rich_ablation.csv", index=False) print(result.to_string(index=False)) if __name__ == "__main__": main()