"""Systematic DeepWalk/Node2Vec ablations on top of the current stacker."""

from __future__ import annotations

import argparse
import importlib.util
from dataclasses import dataclass
from pathlib import Path

import lightgbm as lgb
import networkx as nx
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from node2vec import Node2Vec
from sklearn.metrics import precision_recall_curve, roc_auc_score
from sklearn.model_selection import StratifiedKFold

# Sizes of the author / paper id spaces: node keys are "a0".."a{N_AUTHORS-1}"
# and "p0".."p{N_PAPERS-1}".
N_AUTHORS = 6611
N_PAPERS = 79937

# Which edge families each supported graph_type contains.
_AUTHOR_PAPER_GRAPHS = frozenset({"full", "ap_only", "ap_aa", "ap_pp"})
_AUTHOR_AUTHOR_GRAPHS = frozenset({"full", "ap_aa"})
_PAPER_PAPER_GRAPHS = frozenset({"full", "ap_pp", "pp_only_author_mean"})
_KNOWN_GRAPH_TYPES = _AUTHOR_PAPER_GRAPHS | _AUTHOR_AUTHOR_GRAPHS | _PAPER_PAPER_GRAPHS

# Hyper-parameters shared by every LightGBM stacker trained in this script.
# NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0,
# which is not set here, so row bagging is presumably inactive — confirm
# whether that is intended before changing it (it would shift all scores).
_LGB_PARAMS = {
    "n_estimators": 1200,
    "learning_rate": 0.025,
    "num_leaves": 31,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "reg_lambda": 5.0,
    "min_child_samples": 80,
    "objective": "binary",
    "verbose": -1,
}


def load_module(name: str, path: Path):
    """Import the Python source file at *path* as a module called *name*.

    Raises:
        ImportError: if no import spec/loader can be built for *path*.
    """
    spec = importlib.util.spec_from_file_location(name, path)
    # spec_from_file_location returns None when it cannot handle the path;
    # raise explicitly instead of relying on an assert (stripped under -O).
    if spec is None or spec.loader is None:
        raise ImportError(f"cannot load module {name!r} from {path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


def read_txt(path: Path) -> list[list[int]]:
    """Parse a whitespace-separated integer file into one list per line.

    Blank lines are skipped so callers that unpack each row never see an
    empty list.
    """
    with path.open() as handle:
        return [[int(tok) for tok in line.split()] for line in handle if line.strip()]


def best_f1(y: np.ndarray, s: np.ndarray) -> tuple[float, float, float, float, float]:
    """Return ``(f1, threshold, auc, precision, recall)`` at the best-F1 point.

    The precision/recall curve has one more point than it has thresholds;
    if the best F1 falls on that final point the threshold defaults to 0.5.
    """
    p, r, t = precision_recall_curve(y, s)
    f = 2 * p * r / (p + r + 1e-12)
    i = int(np.argmax(f))
    th = float(t[i]) if i < len(t) else 0.5
    return float(f[i]), th, float(roc_auc_score(y, s)), float(p[i]), float(r[i])


def rank01(x: np.ndarray) -> np.ndarray:
    """Map *x* to evenly spaced ranks in [0, 1] (stable sort, ties by position)."""
    order = np.argsort(x, kind="mergesort")
    out = np.empty(len(x), dtype=np.float32)
    out[order] = np.linspace(0, 1, len(x), dtype=np.float32)
    return out


def zscore(x: np.ndarray) -> np.ndarray:
    """Standardise *x* to zero mean / unit std (epsilon-guarded) as float32."""
    return ((x - x.mean()) / (x.std() + 1e-8)).astype(np.float32)


def fit_lgb_oof(X: np.ndarray, y: np.ndarray, seed: int, n_splits: int) -> np.ndarray:
    """Return out-of-fold positive-class probabilities from a stratified K-fold.

    Each fold trains a fresh LightGBM classifier seeded with ``seed + fold``
    (fold numbering starts at 1) on the training part and scores the
    held-out part.
    """
    oof = np.zeros(len(y), dtype=np.float32)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    for fold, (tr, va) in enumerate(skf.split(X, y), start=1):
        clf = lgb.LGBMClassifier(**_LGB_PARAMS, n_jobs=4, random_state=seed + fold)
        clf.fit(X[tr], y[tr])
        oof[va] = clf.predict_proba(X[va])[:, 1]
    return oof


@dataclass(frozen=True)
class RWConfig:
    """One random-walk embedding variant to train and evaluate."""

    version_name: str  # unique name; used for model, cache and output file names
    graph_type: str  # selects the edge families in build_graph
    method: str  # "DeepWalk" or "Node2Vec"
    dim: int  # embedding dimensionality
    walk_length: int  # number of nodes per walk
    num_walks: int  # walks started from every node
    window: int  # Word2Vec context window
    p: float | None = None  # Node2Vec return parameter (Node2Vec only)
    q: float | None = None  # Node2Vec in-out parameter (Node2Vec only)
    seed: int = 202  # seed for walk generation and Word2Vec


def small_configs() -> list[RWConfig]:
    """Core DeepWalk / Node2Vec hyper-parameter sweep on the full graph."""
    return [
        RWConfig("dw_base_d128_l40_w10_win10", "full", "DeepWalk", 128, 40, 10, 10),
        RWConfig("dw_long_d128_l80_w10_win10", "full", "DeepWalk", 128, 80, 10, 10),
        RWConfig("dw_highdim_d256_l40_w10_win10", "full", "DeepWalk", 256, 40, 10, 10),
        RWConfig("n2v_bfs_d128_l40_w10_win10_p1_q2", "full", "Node2Vec", 128, 40, 10, 10, 1.0, 2.0),
        RWConfig("n2v_dfs_d128_l40_w10_win10_p1_q0.5", "full", "Node2Vec", 128, 40, 10, 10, 1.0, 0.5),
        RWConfig("n2v_bal_d128_l40_w10_win10_p1_q1", "full", "Node2Vec", 128, 40, 10, 10, 1.0, 1.0),
    ]


def graph_configs() -> list[RWConfig]:
    """DeepWalk with fixed hyper-parameters over different graph constructions."""
    return [
        RWConfig("dw_graph_ap_only", "ap_only", "DeepWalk", 128, 40, 10, 10),
        RWConfig("dw_graph_ap_aa", "ap_aa", "DeepWalk", 128, 40, 10, 10),
        RWConfig("dw_graph_ap_pp", "ap_pp", "DeepWalk", 128, 40, 10, 10),
        RWConfig("dw_graph_pp_author_mean", "pp_only_author_mean", "DeepWalk", 128, 40, 10, 10),
    ]


def extra_configs() -> list[RWConfig]:
    """Additional seeds, dimensions and Node2Vec ``p`` values on the full graph."""
    return [
        RWConfig("dw_seed42_d128_l40_w10_win10", "full", "DeepWalk", 128, 40, 10, 10, seed=42),
        RWConfig("dw_seed3407_d128_l40_w10_win10", "full", "DeepWalk", 128, 40, 10, 10, seed=3407),
        RWConfig("dw_d64_l40_w10_win10", "full", "DeepWalk", 64, 40, 10, 10),
        RWConfig("dw_d256_l80_w10_win10", "full", "DeepWalk", 256, 80, 10, 10),
        RWConfig("n2v_p0.5_q1_d128_l40_w10_win10", "full", "Node2Vec", 128, 40, 10, 10, 0.5, 1.0),
        RWConfig("n2v_p2_q1_d128_l40_w10_win10", "full", "Node2Vec", 128, 40, 10, 10, 2.0, 1.0),
    ]


# Maps each --mode value to the factory producing its configurations.
_CONFIG_SETS = {
    "small": small_configs,
    "graph": graph_configs,
    "extra": extra_configs,
}


def build_graph(root: Path, train_refs: pd.DataFrame, graph_type: str) -> nx.Graph:
    """Build the undirected, weighted graph for *graph_type*.

    Author-paper edges come from the ``source``/``target`` columns of
    *train_refs* (weight 3.0); author-author and paper-paper edges are read
    from ``data_and_docs`` under *root* (weight 1.0).  The
    ``pp_only_author_mean`` graph contains paper nodes only.

    Raises:
        ValueError: if *graph_type* is not one of the supported names
            (previously this silently produced a graph with no edges).
    """
    if graph_type not in _KNOWN_GRAPH_TYPES:
        raise ValueError(
            f"unknown graph_type {graph_type!r}; expected one of {sorted(_KNOWN_GRAPH_TYPES)}"
        )
    data_dir = root / "data_and_docs"
    G = nx.Graph()
    if graph_type != "pp_only_author_mean":
        G.add_nodes_from([f"a{a}" for a in range(N_AUTHORS)])
    G.add_nodes_from([f"p{p}" for p in range(N_PAPERS)])
    if graph_type in _AUTHOR_PAPER_GRAPHS:
        for a, p in train_refs[["source", "target"]].to_numpy(np.int64):
            G.add_edge(f"a{int(a)}", f"p{int(p)}", weight=3.0)
    if graph_type in _AUTHOR_AUTHOR_GRAPHS:
        for a, b in read_txt(data_dir / "author_file_ann.txt"):
            G.add_edge(f"a{a}", f"a{b}", weight=1.0)
    if graph_type in _PAPER_PAPER_GRAPHS:
        for s, t in read_txt(data_dir / "paper_file_ann.txt"):
            G.add_edge(f"p{s}", f"p{t}", weight=1.0)
    return G


def deepwalk_walks(G: nx.Graph, walk_length: int, num_walks: int, seed: int) -> list[list[str]]:
    """Generate uniform random walks (edge weights are ignored).

    For each of *num_walks* passes the node order is reshuffled and one walk
    of at most *walk_length* nodes is started from every node; a walk stops
    early when it reaches a node without neighbours.
    """
    rng = np.random.default_rng(seed)
    nodes = np.array(list(G.nodes()), dtype=object)
    # Materialise adjacency once so the inner loop is a plain dict lookup.
    neigh = {n: list(G.neighbors(n)) for n in G.nodes()}
    walks: list[list[str]] = []
    for _ in range(num_walks):
        order = nodes.copy()
        rng.shuffle(order)
        for start in order:
            walk = [start]
            cur = start
            for _step in range(walk_length - 1):
                ns = neigh[cur]
                if not ns:
                    break
                cur = ns[int(rng.integers(0, len(ns)))]
                walk.append(cur)
            walks.append(walk)
    return walks


def train_model(G: nx.Graph, cfg: RWConfig, out_dir: Path, workers: int) -> Word2Vec:
    """Train (or load from ``out_dir``) the embedding model described by *cfg*.

    A model already saved as ``<version_name>.model`` is loaded instead of
    retrained.

    Raises:
        ValueError: if ``cfg.method`` is neither "DeepWalk" nor "Node2Vec",
            or if a Node2Vec config lacks ``p``/``q``.
    """
    model_path = out_dir / f"{cfg.version_name}.model"
    if model_path.exists():
        return Word2Vec.load(str(model_path))

    if cfg.method == "DeepWalk":
        walks = deepwalk_walks(G, cfg.walk_length, cfg.num_walks, cfg.seed)
        model = Word2Vec(
            sentences=walks,
            vector_size=cfg.dim,
            window=cfg.window,
            min_count=0,
            sg=1,
            negative=5,
            epochs=3,
            workers=workers,
            seed=cfg.seed,
        )
    elif cfg.method == "Node2Vec":
        if cfg.p is None or cfg.q is None:
            raise ValueError(f"Node2Vec config {cfg.version_name!r} requires both p and q")
        n2v = Node2Vec(
            G,
            dimensions=cfg.dim,
            walk_length=cfg.walk_length,
            num_walks=cfg.num_walks,
            p=float(cfg.p),
            q=float(cfg.q),
            workers=workers,
            seed=cfg.seed,
            quiet=False,
        )
        model = n2v.fit(window=cfg.window, min_count=0, batch_words=4096, seed=cfg.seed, epochs=3)
    else:
        raise ValueError(f"unknown method {cfg.method!r}; expected 'DeepWalk' or 'Node2Vec'")

    model.save(str(model_path))
    return model


def embedding_arrays(model: Word2Vec, train_refs: pd.DataFrame | None = None) -> tuple[np.ndarray, np.ndarray]:
    """Return dense ``(author_vectors, paper_vectors)`` indexed by integer id.

    Ids missing from the model vocabulary keep an all-zero row.  When
    *train_refs* is given and no author has a non-zero vector (e.g. the
    graph had no author nodes), each author becomes the mean of the paper
    vectors it is linked to in *train_refs*.
    """
    dim = model.vector_size
    avec = np.zeros((N_AUTHORS, dim), dtype=np.float32)
    pvec = np.zeros((N_PAPERS, dim), dtype=np.float32)
    for p in range(N_PAPERS):
        key = f"p{p}"
        if key in model.wv:
            pvec[p] = model.wv[key]
    for a in range(N_AUTHORS):
        key = f"a{a}"
        if key in model.wv:
            avec[a] = model.wv[key]

    if train_refs is not None and not np.any(np.abs(avec).sum(axis=1) > 0):
        author_papers: list[list[int]] = [[] for _ in range(N_AUTHORS)]
        for a, p in train_refs[["source", "target"]].to_numpy(np.int64):
            author_papers[int(a)].append(int(p))
        for a, hist in enumerate(author_papers):
            if hist:
                avec[a] = pvec[np.asarray(hist, dtype=np.int64)].mean(axis=0)
    return avec, pvec


def pair_feature_block(
    model: Word2Vec,
    pairs: np.ndarray,
    cfg: RWConfig,
    root: Path,
    split_seed: int,
    train_refs: pd.DataFrame,
) -> tuple[np.ndarray, list[str]]:
    """Compute (or load cached) embedding features for ``(author, paper)`` pairs.

    Returns the float32 feature matrix (11 columns, one row per pair) and the
    column names prefixed with ``cfg.version_name``.  Results are cached as a
    compressed ``.npz`` keyed by the version name, the number of pairs and
    the column sums of *pairs*.
    """
    cache_dir = root / "validation_runs" / f"dynamic_seed{split_seed}" / "randomwalk_systematic" / "pair_features"
    cache_dir.mkdir(parents=True, exist_ok=True)
    key = f"{cfg.version_name}_{len(pairs)}_{int(pairs[:,0].sum())}_{int(pairs[:,1].sum())}.npz"
    path = cache_dir / key
    names = [
        "dot",
        "cos",
        "hadamard_mean",
        "absdiff_mean",
        "l2_distance",
        "dot_global_rank",
        "cos_global_rank",
        "dot_author_rank",
        "cos_author_rank",
        "dot_author_pct",
        "cos_author_pct",
    ]
    names = [f"{cfg.version_name}_{n}" for n in names]
    if path.exists():
        # Close the archive handle once the array has been materialised.
        with np.load(path) as cached:
            return cached["X"].astype(np.float32), names

    # Only the paper-only graph needs author vectors derived from papers.
    avec, pvec = embedding_arrays(model, train_refs if cfg.graph_type == "pp_only_author_mean" else None)
    A = avec[pairs[:, 0]]
    P = pvec[pairs[:, 1]]
    dot = np.sum(A * P, axis=1).astype(np.float32)
    cos = (dot / ((np.linalg.norm(A, axis=1) + 1e-8) * (np.linalg.norm(P, axis=1) + 1e-8))).astype(np.float32)
    had = np.mean(A * P, axis=1).astype(np.float32)
    absdiff = np.mean(np.abs(A - P), axis=1).astype(np.float32)
    l2 = np.sqrt(np.sum((A - P) ** 2, axis=1)).astype(np.float32)

    dot_ar = np.zeros(len(pairs), dtype=np.float32)
    cos_ar = np.zeros(len(pairs), dtype=np.float32)
    dot_pct = np.zeros(len(pairs), dtype=np.float32)
    cos_pct = np.zeros(len(pairs), dtype=np.float32)
    df = pd.DataFrame({"idx": np.arange(len(pairs)), "author": pairs[:, 0], "dot": dot, "cos": cos})
    # Within each author's candidate set: ascending rank (0..n-1) and the
    # same rank rescaled to [0, 1]; a lone candidate gets percentile 1.0.
    for _, g in df.groupby("author", sort=False):
        idx = g["idx"].to_numpy()
        n = len(idx)
        vals = np.linspace(0, 1, n, dtype=np.float32) if n > 1 else np.array([1.0], dtype=np.float32)
        od = np.argsort(g["dot"].to_numpy(), kind="mergesort")
        oc = np.argsort(g["cos"].to_numpy(), kind="mergesort")
        dot_ar[idx[od]] = np.arange(n, dtype=np.float32)
        cos_ar[idx[oc]] = np.arange(n, dtype=np.float32)
        dot_pct[idx[od]] = vals
        cos_pct[idx[oc]] = vals

    X = np.column_stack(
        [dot, cos, had, absdiff, l2, rank01(dot), rank01(cos), dot_ar, cos_ar, dot_pct, cos_pct]
    ).astype(np.float32)
    np.savez_compressed(path, X=X)
    return X, names


def build_base_features(
    root: Path, split_seed: int, main_score_file: Path
) -> tuple[pd.DataFrame, np.ndarray, np.ndarray, np.ndarray]:
    """Assemble the baseline stacker features for the validation pairs.

    Sibling scripts under ``root / "code"`` are imported by file path and
    their feature builders are column-stacked with previously saved scores.
    The exact semantics of those helpers are defined in the sibling scripts,
    not here.

    Returns:
        ``(train_refs, pairs, y, X)``: the training references, the int64
        ``(source, target)`` validation pairs, their int8 labels and the
        float32 base feature matrix.
    """
    stack = load_module("stack", root / "code" / "stack_rank_calibration.py")
    lgcn = load_module("lgcn", root / "code" / "train_val_lgcn_ensemble.py")
    post = load_module("post", root / "code" / "post95_ablation.py")
    gen = load_module("gen", root / "code" / "generate_post95_submission.py")
    extra = load_module("extra", root / "code" / "extra_score_sources_ablation.py")
    split_dir = root / "validation_runs" / f"dynamic_seed{split_seed}"

    train_refs, val_pairs = lgcn.make_notebook_style_split(root, split_seed, 0.9)
    pairs = val_pairs[["source", "target"]].to_numpy(np.int64)
    y = val_pairs["label"].to_numpy(np.int8)
    main = np.load(main_score_file).astype(np.float32)

    builder = stack.ExplicitGraphFeatures(root, train_refs)
    Xh = builder.transform(pairs)
    X = np.column_stack(
        [
            stack.add_rank_features(pairs, main),
            Xh,
            post.negative_evidence_features(Xh, main),
            gen.topk_content_similarity_fast(root, pairs, builder),
        ]
    ).astype(np.float32)

    # One score-file path per non-empty line of the selection list.
    selected_file = split_dir / "post95_submission" / "selected_variant_val_scores.txt"
    selected = [Path(x.strip()) for x in selected_file.read_text().splitlines() if x.strip()]
    variant_scores = [np.load(p).astype(np.float32) for p in selected]
    X = np.column_stack([X, gen.variant_feature_matrix(post, variant_scores)]).astype(np.float32)

    content = extra.content_mean_score(root, pairs, builder)
    mf = np.load(split_dir / "extra_score_sources" / "val_mf_bpr_s202_d256.npy").astype(np.float32)
    Xc, _ = extra.score_to_features(content, "content_mean_cos", pairs)
    Xm, _ = extra.score_to_features(mf, "mf_bpr", pairs)
    X = np.column_stack([X, Xc, Xm]).astype(np.float32)
    return train_refs, pairs, y, X


def train_full_predict(X: np.ndarray, y: np.ndarray, X_test: np.ndarray, seed: int):
    """Fit one classifier on all of ``(X, y)`` and score *X_test*.

    Returns ``(probabilities, classifier)`` where the probabilities are the
    float32 positive-class scores for *X_test*.
    """
    clf = lgb.LGBMClassifier(**_LGB_PARAMS, random_state=seed)
    clf.fit(X, y)
    return clf.predict_proba(X_test)[:, 1].astype(np.float32), clf


def main() -> None:
    """Run the selected ablation set and a top-5 random-walk ensemble.

    For every configuration of ``--mode`` the embedding is trained (or
    loaded), pair features are appended to the base features and an
    out-of-fold LightGBM score is evaluated.  The per-config table and the
    ensemble row are written to ``<mode>_ablation_table.csv``.  Test-side
    columns are recorded as NaN placeholders; no test predictions are
    produced here.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--package-root", type=Path, default=Path(__file__).resolve().parents[1])
    parser.add_argument("--split-seed", type=int, default=202)
    parser.add_argument("--main-val-score-file", type=Path, required=True)
    parser.add_argument("--workers", type=int, default=8)
    parser.add_argument("--seed", type=int, default=202)
    parser.add_argument("--n-splits", type=int, default=5)
    parser.add_argument("--mode", choices=list(_CONFIG_SETS), default="small")
    args = parser.parse_args()

    root = args.package_root
    split_dir = root / "validation_runs" / f"dynamic_seed{args.split_seed}"
    out_dir = split_dir / "randomwalk_systematic"
    model_dir = out_dir / "models"
    out_dir.mkdir(parents=True, exist_ok=True)
    model_dir.mkdir(parents=True, exist_ok=True)

    train_refs, pairs, y, X_base = build_base_features(root, args.split_seed, args.main_val_score_file)
    configs = _CONFIG_SETS[args.mode]()

    rows = []
    feature_blocks: list[np.ndarray] = []
    for cfg in configs:
        print(f"\n=== {cfg.version_name} ===")
        G = build_graph(root, train_refs, cfg.graph_type)
        print(f"graph_type={cfg.graph_type} nodes={G.number_of_nodes()} edges={G.number_of_edges()}")
        model = train_model(G, cfg, model_dir, args.workers)
        block, _names = pair_feature_block(model, pairs, cfg, root, args.split_seed, train_refs)
        X = np.column_stack([X_base, block]).astype(np.float32)
        # Each configuration gets its own CV seed, offset by its position.
        oof = fit_lgb_oof(X, y, args.seed + len(rows) * 13, args.n_splits)
        f1, th, auc, p, r = best_f1(y, oof)
        np.save(out_dir / f"{cfg.version_name}_oof.npy", oof)
        # Full test generation is delegated to the ensemble script for selected versions;
        # single-version submission paths are recorded as intended paths.
        sub_path = out_dir / "single_submissions" / f"submission_{cfg.version_name}_th0.480000.csv"
        rows.append(
            {
                "version_name": cfg.version_name,
                "graph_type": cfg.graph_type,
                "method": cfg.method,
                "dim": cfg.dim,
                "walk_length": cfg.walk_length,
                "num_walks": cfg.num_walks,
                "window": cfg.window,
                "p": cfg.p,
                "q": cfg.q,
                "validation_F1": f1,
                "threshold": th,
                "auc": auc,
                "precision": p,
                "recall": r,
                "predicted_positive_ratio": np.nan,
                "public_submission_path": str(sub_path),
                "changed_predictions_vs_current_best": np.nan,
                "rw_feature_importance_best_rank": np.nan,
            }
        )
        feature_blocks.append(block)

    table_path = out_dir / f"{args.mode}_ablation_table.csv"
    result = pd.DataFrame(rows).sort_values("validation_F1", ascending=False)
    result.to_csv(table_path, index=False)
    print(result.to_string(index=False))

    # Ensemble top 5 by validation F1 using aggregate random-walk features.
    # sort_values keeps the original row labels, so the index still points
    # into feature_blocks.
    top_idx = result.index[: min(5, len(result))].to_list()
    blocks = [feature_blocks[i] for i in top_idx]
    cos_stack = np.vstack([b[:, 1] for b in blocks])
    dot_stack = np.vstack([b[:, 0] for b in blocks])
    ar_stack = np.vstack([b[:, 10] for b in blocks])  # cosine author percentile
    # Number of selected models placing the pair in the upper half for its author.
    agree = (ar_stack >= 0.5).sum(axis=0).astype(np.float32)
    agg = np.column_stack(
        [
            cos_stack.mean(axis=0),
            cos_stack.std(axis=0),
            cos_stack.max(axis=0),
            cos_stack.min(axis=0),
            dot_stack.mean(axis=0),
            dot_stack.std(axis=0),
            ar_stack.mean(axis=0),
            ar_stack.std(axis=0),
            ar_stack.max(axis=0),
            agree,
        ]
    ).astype(np.float32)
    X_ens = np.column_stack([X_base, *blocks, agg]).astype(np.float32)
    oof = fit_lgb_oof(X_ens, y, args.seed + 999, args.n_splits)
    f1, th, auc, p, r = best_f1(y, oof)
    np.save(out_dir / f"{args.mode}_ensemble_oof.npy", oof)
    ens_row = {
        "version_name": f"{args.mode}_top{len(blocks)}_rw_ensemble",
        "graph_type": "mixed",
        "method": "RWEnsemble",
        "dim": np.nan,
        "walk_length": np.nan,
        "num_walks": np.nan,
        "window": np.nan,
        "p": np.nan,
        "q": np.nan,
        "validation_F1": f1,
        "threshold": th,
        "auc": auc,
        "precision": p,
        "recall": r,
        "predicted_positive_ratio": np.nan,
        "public_submission_path": str(split_dir / "randomwalk_ensemble_submission"),
        "changed_predictions_vs_current_best": np.nan,
        "rw_feature_importance_best_rank": np.nan,
    }
    result = pd.concat([result, pd.DataFrame([ens_row])], ignore_index=True).sort_values(
        "validation_F1", ascending=False
    )
    result.to_csv(table_path, index=False)
    print("\nFinal table:")
    print(result.to_string(index=False))


if __name__ == "__main__":
    main()