| """Systematic DeepWalk/Node2Vec ablations on top of the current stacker.""" |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import importlib.util |
| from dataclasses import dataclass |
| from pathlib import Path |
|
|
| import lightgbm as lgb |
| import networkx as nx |
| import numpy as np |
| import pandas as pd |
| from gensim.models import Word2Vec |
| from node2vec import Node2Vec |
| from sklearn.metrics import precision_recall_curve, roc_auc_score |
| from sklearn.model_selection import StratifiedKFold |
|
|
|
|
def load_module(name: str, path: Path):
    """Import the Python source file at ``path`` as a module called ``name``.

    The file is executed and the resulting module object is returned. The
    module is not inserted into ``sys.modules``.

    Args:
        name: Module name to assign to the loaded module.
        path: Location of the source file.

    Returns:
        The executed module object.

    Raises:
        ImportError: If no import spec or loader can be derived from ``path``
            (for example when the file suffix is not a recognised source
            suffix).
    """
    spec = importlib.util.spec_from_file_location(name, path)
    # Validate before building the module: a missing spec would otherwise
    # surface as an unrelated AttributeError, and an ``assert`` would vanish
    # when Python runs with ``-O``.
    if spec is None or spec.loader is None:
        raise ImportError(f"cannot load module {name!r} from {path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
|
|
|
|
def read_txt(path: Path) -> list[list[int]]:
    """Parse a whitespace-separated integer table from a text file.

    Args:
        path: File to read; each non-blank line holds integers separated by
            whitespace.

    Returns:
        One list of ints per non-blank line, in file order.

    Raises:
        ValueError: If a token cannot be parsed as an integer.
    """
    # The context manager closes the handle deterministically.
    with path.open() as handle:
        # Blank lines (e.g. a trailing newline at end of file) are skipped so
        # callers that unpack fixed-width rows never receive an empty row.
        return [[int(token) for token in line.split()] for line in handle if line.strip()]
|
|
|
|
def best_f1(y: np.ndarray, s: np.ndarray):
    """Find the F1-maximising operating point of scores ``s`` against labels ``y``.

    Returns:
        A tuple ``(f1, threshold, auc, precision, recall)`` of Python floats,
        where ``f1``, ``precision`` and ``recall`` are taken at the point of
        the precision-recall curve with the highest F1 and ``auc`` is the
        ROC AUC of the scores.
    """
    precision, recall, thresholds = precision_recall_curve(y, s)
    # The small constant keeps the division defined when precision + recall == 0.
    f1_curve = 2 * precision * recall / (precision + recall + 1e-12)
    best = int(np.argmax(f1_curve))

    # Guard for a best index that has no matching threshold entry; fall back
    # to 0.5 in that case.
    if best < len(thresholds):
        threshold = float(thresholds[best])
    else:
        threshold = 0.5

    auc = float(roc_auc_score(y, s))
    return float(f1_curve[best]), threshold, auc, float(precision[best]), float(recall[best])
|
|
|
|
def rank01(x: np.ndarray) -> np.ndarray:
    """Map values to evenly spaced ranks in ``[0, 1]`` (float32).

    The smallest value receives 0 and the largest 1. Ties are broken by
    original position because the sort is stable.
    """
    n = len(x)
    ascending = np.argsort(x, kind="mergesort")
    grid = np.linspace(0, 1, n, dtype=np.float32)
    # Invert the sorting permutation: slot[i] is the sorted position of x[i].
    slot = np.empty(n, dtype=np.intp)
    slot[ascending] = np.arange(n, dtype=np.intp)
    return grid[slot]
|
|
|
|
def zscore(x: np.ndarray) -> np.ndarray:
    """Standardise ``x`` to zero mean and unit standard deviation (float32).

    A small constant is added to the standard deviation so a constant input
    yields zeros instead of a division by zero.
    """
    centered = x - x.mean()
    scale = x.std() + 1e-8
    return (centered / scale).astype(np.float32)
|
|
|
|
def fit_lgb_oof(X: np.ndarray, y: np.ndarray, seed: int, n_splits: int) -> np.ndarray:
    """Produce out-of-fold positive-class probabilities with LightGBM.

    Args:
        X: Feature matrix, indexed by row.
        y: Binary labels aligned with the rows of ``X``.
        seed: Seed for the fold shuffling; each fold's classifier uses
            ``seed + fold`` with folds numbered from 1.
        n_splits: Number of stratified folds.

    Returns:
        float32 array of length ``len(y)``; each entry is predicted by the
        classifier that did not train on that row.
    """
    params = dict(
        n_estimators=1200,
        learning_rate=0.025,
        num_leaves=31,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=5.0,
        min_child_samples=80,
        objective="binary",
        n_jobs=4,
        verbose=-1,
    )
    predictions = np.zeros(len(y), dtype=np.float32)
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    fold = 0
    for train_idx, valid_idx in splitter.split(X, y):
        fold += 1
        booster = lgb.LGBMClassifier(random_state=seed + fold, **params)
        booster.fit(X[train_idx], y[train_idx])
        # Column 1 of predict_proba is the positive class.
        predictions[valid_idx] = booster.predict_proba(X[valid_idx])[:, 1]
    return predictions
|
|
|
|
@dataclass(frozen=True)
class RWConfig:
    """Immutable description of one random-walk embedding experiment."""

    # Unique run label; also used for the saved model file name, the
    # pair-feature cache key and the feature-name prefix.
    version_name: str
    # Which edge sets build_graph includes (e.g. "full", "ap_only",
    # "ap_aa", "ap_pp", "pp_only_author_mean").
    graph_type: str
    # "DeepWalk" selects the in-file uniform walker; any other value is
    # routed to the Node2Vec branch of train_model.
    method: str
    # Embedding dimensionality.
    dim: int
    # Maximum number of nodes per walk.
    walk_length: int
    # Number of walks started from every node.
    num_walks: int
    # Word2Vec context window.
    window: int
    # Node2Vec p and q parameters; left as None for DeepWalk configs.
    p: float | None = None
    q: float | None = None
    # Seed for walk generation and Word2Vec training.
    seed: int = 202
|
|
|
|
def small_configs() -> list[RWConfig]:
    """Return the core sweep: three DeepWalk and three Node2Vec runs on the full graph.

    All runs use 10 walks per node and a window of 10. The DeepWalk runs vary
    dimension and walk length; the Node2Vec runs fix ``p=1.0`` and vary ``q``.
    """
    deepwalk_runs = [
        RWConfig(name, "full", "DeepWalk", dim, walk_length, 10, 10)
        for name, dim, walk_length in (
            ("dw_base_d128_l40_w10_win10", 128, 40),
            ("dw_long_d128_l80_w10_win10", 128, 80),
            ("dw_highdim_d256_l40_w10_win10", 256, 40),
        )
    ]
    node2vec_runs = [
        RWConfig(name, "full", "Node2Vec", 128, 40, 10, 10, 1.0, q)
        for name, q in (
            ("n2v_bfs_d128_l40_w10_win10_p1_q2", 2.0),
            ("n2v_dfs_d128_l40_w10_win10_p1_q0.5", 0.5),
            ("n2v_bal_d128_l40_w10_win10_p1_q1", 1.0),
        )
    ]
    return deepwalk_runs + node2vec_runs
|
|
|
|
def graph_configs() -> list[RWConfig]:
    """Return the graph-structure sweep: one baseline DeepWalk setup per graph type.

    Every run uses dimension 128, walk length 40, 10 walks per node and a
    window of 10; only the graph type changes.
    """
    variants = (
        ("dw_graph_ap_only", "ap_only"),
        ("dw_graph_ap_aa", "ap_aa"),
        ("dw_graph_ap_pp", "ap_pp"),
        ("dw_graph_pp_author_mean", "pp_only_author_mean"),
    )
    return [
        RWConfig(name, graph_type, "DeepWalk", 128, 40, 10, 10)
        for name, graph_type in variants
    ]
|
|
|
|
def extra_configs() -> list[RWConfig]:
    """Return additional full-graph runs: seed, dimension and ``p`` variations."""

    def deepwalk(name: str, dim: int, walk_length: int, **overrides) -> RWConfig:
        # Overrides are forwarded only when given, so the dataclass defaults
        # apply otherwise.
        return RWConfig(name, "full", "DeepWalk", dim, walk_length, 10, 10, **overrides)

    def node2vec(name: str, p: float, q: float) -> RWConfig:
        return RWConfig(name, "full", "Node2Vec", 128, 40, 10, 10, p, q)

    return [
        deepwalk("dw_seed42_d128_l40_w10_win10", 128, 40, seed=42),
        deepwalk("dw_seed3407_d128_l40_w10_win10", 128, 40, seed=3407),
        deepwalk("dw_d64_l40_w10_win10", 64, 40),
        deepwalk("dw_d256_l80_w10_win10", 256, 80),
        node2vec("n2v_p0.5_q1_d128_l40_w10_win10", 0.5, 1.0),
        node2vec("n2v_p2_q1_d128_l40_w10_win10", 2.0, 1.0),
    ]
|
|
|
|
def build_graph(root: Path, train_refs: pd.DataFrame, graph_type: str) -> nx.Graph:
    """Assemble the author/paper graph for the requested ``graph_type``.

    Nodes are named ``a<id>`` (authors) and ``p<id>`` (papers). Author nodes
    are omitted for ``"pp_only_author_mean"``. Edges are added as follows:

    * author-paper edges from ``train_refs`` (weight 3.0) for ``full``,
      ``ap_only``, ``ap_aa`` and ``ap_pp``;
    * author-author edges from ``author_file_ann.txt`` (weight 1.0) for
      ``full`` and ``ap_aa``;
    * paper-paper edges from ``paper_file_ann.txt`` (weight 1.0) for ``full``,
      ``ap_pp`` and ``pp_only_author_mean``.

    Any other ``graph_type`` yields a graph with nodes but no edges.

    Args:
        root: Package root containing the ``data_and_docs`` directory.
        train_refs: Frame with integer ``source`` (author) and ``target``
            (paper) columns.
        graph_type: Name of the edge-set combination.
    """
    data_dir = root / "data_and_docs"
    graph = nx.Graph()

    # All ids are pre-registered so isolated nodes are still part of the graph.
    if graph_type != "pp_only_author_mean":
        graph.add_nodes_from(f"a{a}" for a in range(6611))
    graph.add_nodes_from(f"p{p}" for p in range(79937))

    if graph_type in {"full", "ap_only", "ap_aa", "ap_pp"}:
        author_paper = train_refs[["source", "target"]].to_numpy(np.int64)
        graph.add_weighted_edges_from(
            (f"a{int(a)}", f"p{int(p)}", 3.0) for a, p in author_paper
        )
    if graph_type in {"full", "ap_aa"}:
        graph.add_weighted_edges_from(
            (f"a{a}", f"a{b}", 1.0) for a, b in read_txt(data_dir / "author_file_ann.txt")
        )
    if graph_type in {"full", "ap_pp", "pp_only_author_mean"}:
        graph.add_weighted_edges_from(
            (f"p{s}", f"p{t}", 1.0) for s, t in read_txt(data_dir / "paper_file_ann.txt")
        )
    return graph
|
|
|
|
def deepwalk_walks(G: nx.Graph, walk_length: int, num_walks: int, seed: int) -> list[list[str]]:
    """Generate uniform random walks starting from every node of ``G``.

    For each of ``num_walks`` rounds the node order is shuffled and one walk
    is started per node. Each step moves to a neighbour chosen uniformly at
    random (edge weights are not consulted). A walk stops early when the
    current node has no neighbours.

    Args:
        G: Graph providing ``nodes()`` and ``neighbors(node)``.
        walk_length: Maximum number of nodes per walk, start node included.
        num_walks: Number of rounds, i.e. walks started from each node.
        seed: Seed for the NumPy random generator driving shuffles and steps.

    Returns:
        ``num_walks * number_of_nodes`` walks, each a list of node labels.
    """
    rng = np.random.default_rng(seed)
    all_nodes = np.array(list(G.nodes()), dtype=object)
    # Materialise adjacency once so each step is a plain list lookup.
    adjacency = {node: list(G.neighbors(node)) for node in G.nodes()}
    max_steps = walk_length - 1

    corpus: list[list[str]] = []
    for _round in range(num_walks):
        starts = all_nodes.copy()
        rng.shuffle(starts)
        for origin in starts:
            path = [origin]
            here = origin
            taken = 0
            while taken < max_steps:
                options = adjacency[here]
                if not options:
                    break
                here = options[int(rng.integers(0, len(options)))]
                path.append(here)
                taken += 1
            corpus.append(path)
    return corpus
|
|
|
|
def train_model(G: nx.Graph, cfg: RWConfig, out_dir: Path, workers: int) -> Word2Vec:
    """Train the embedding model for ``cfg``, or load it if already saved.

    A model stored at ``out_dir / "<version_name>.model"`` is loaded and
    returned without retraining. Otherwise ``cfg.method == "DeepWalk"`` trains
    skip-gram Word2Vec on walks from ``deepwalk_walks``; any other method
    value goes through the ``Node2Vec`` wrapper. The new model is saved to
    that path before being returned.

    Args:
        G: Graph to embed.
        cfg: Experiment configuration; ``p`` and ``q`` must be set for the
            Node2Vec branch because they are converted with ``float``.
        out_dir: Directory holding saved models.
        workers: Worker count forwarded to the training libraries.
    """
    model_path = out_dir / f"{cfg.version_name}.model"
    if model_path.exists():
        return Word2Vec.load(str(model_path))

    if cfg.method != "DeepWalk":
        walker = Node2Vec(
            G,
            dimensions=cfg.dim,
            walk_length=cfg.walk_length,
            num_walks=cfg.num_walks,
            p=float(cfg.p),
            q=float(cfg.q),
            workers=workers,
            seed=cfg.seed,
            quiet=False,
        )
        model = walker.fit(
            window=cfg.window,
            min_count=0,
            batch_words=4096,
            seed=cfg.seed,
            epochs=3,
        )
    else:
        sentences = deepwalk_walks(G, cfg.walk_length, cfg.num_walks, cfg.seed)
        model = Word2Vec(
            sentences=sentences,
            vector_size=cfg.dim,
            window=cfg.window,
            min_count=0,
            sg=1,
            negative=5,
            epochs=3,
            workers=workers,
            seed=cfg.seed,
        )

    model.save(str(model_path))
    return model
|
|
|
|
def embedding_arrays(model: Word2Vec, train_refs: pd.DataFrame | None = None) -> tuple[np.ndarray, np.ndarray]:
    """Extract dense author and paper embedding tables from a trained model.

    Rows for ids whose ``a<id>`` / ``p<id>`` key is missing from ``model.wv``
    stay zero. If ``train_refs`` is given and no author row is non-zero, each
    author with at least one training paper gets the mean of those papers'
    vectors.

    Args:
        model: Object exposing ``vector_size`` and a ``wv`` mapping keyed by
            node label.
        train_refs: Optional frame with integer ``source`` (author) and
            ``target`` (paper) columns, used only for the fallback.

    Returns:
        ``(author_vectors, paper_vectors)`` as float32 arrays of shape
        ``(6611, dim)`` and ``(79937, dim)``.
    """
    vectors = model.wv
    width = model.vector_size
    author_vecs = np.zeros((6611, width), dtype=np.float32)
    paper_vecs = np.zeros((79937, width), dtype=np.float32)

    for prefix, table in (("p", paper_vecs), ("a", author_vecs)):
        for node_id in range(table.shape[0]):
            token = f"{prefix}{node_id}"
            if token in vectors:
                table[node_id] = vectors[token]

    # Nothing more to do without reference pairs, or when at least one author
    # already has an embedding.
    if train_refs is None or np.any(np.abs(author_vecs).sum(axis=1) > 0):
        return author_vecs, paper_vecs

    histories: list[list[int]] = [[] for _ in range(6611)]
    for author, paper in train_refs[["source", "target"]].to_numpy(np.int64):
        histories[int(author)].append(int(paper))
    for author, papers in enumerate(histories):
        if papers:
            author_vecs[author] = paper_vecs[np.asarray(papers, dtype=np.int64)].mean(axis=0)
    return author_vecs, paper_vecs
|
|
|
|
def pair_feature_block(
    model: Word2Vec,
    pairs: np.ndarray,
    cfg: RWConfig,
    root: Path,
    split_seed: int,
    train_refs: pd.DataFrame,
) -> tuple[np.ndarray, list[str]]:
    """Compute (or load from cache) the embedding features for author-paper pairs.

    The eleven feature columns, in order, are: dot product, cosine similarity,
    mean Hadamard product, mean absolute difference, L2 distance, global rank
    of dot and of cosine (scaled to ``[0, 1]``), per-author integer rank of
    dot and of cosine, and per-author percentile of dot and of cosine.

    Results are cached as a compressed ``.npz`` under
    ``validation_runs/dynamic_seed<split_seed>/randomwalk_systematic/pair_features``.
    NOTE: the cache key is built from ``cfg.version_name``, the number of
    pairs and the two column sums only, so it is a weak fingerprint of
    ``pairs`` and does not reflect a retrained model.

    Args:
        model: Trained embedding model; only consulted on a cache miss.
        pairs: Integer array with author ids in column 0 and paper ids in
            column 1.
        cfg: Configuration supplying the name prefix and graph type.
        root: Package root under which the cache directory is created.
        split_seed: Validation split seed; selects the cache directory.
        train_refs: Training pairs, forwarded to ``embedding_arrays`` only for
            the ``"pp_only_author_mean"`` graph type.

    Returns:
        ``(X, names)``: a float32 matrix of shape ``(len(pairs), 11)`` and the
        matching column names, each prefixed with ``cfg.version_name``.
    """
    cache_dir = root / "validation_runs" / f"dynamic_seed{split_seed}" / "randomwalk_systematic" / "pair_features"
    cache_dir.mkdir(parents=True, exist_ok=True)
    key = f"{cfg.version_name}_{len(pairs)}_{int(pairs[:,0].sum())}_{int(pairs[:,1].sum())}.npz"
    path = cache_dir / key
    names = [
        "dot",
        "cos",
        "hadamard_mean",
        "absdiff_mean",
        "l2_distance",
        "dot_global_rank",
        "cos_global_rank",
        "dot_author_rank",
        "cos_author_rank",
        "dot_author_pct",
        "cos_author_pct",
    ]
    names = [f"{cfg.version_name}_{n}" for n in names]
    if path.exists():
        # An NpzFile keeps the archive open until closed; reading inside the
        # context manager releases the handle as soon as the array is copied.
        with np.load(path) as cached:
            return cached["X"].astype(np.float32), names

    avec, pvec = embedding_arrays(model, train_refs if cfg.graph_type == "pp_only_author_mean" else None)
    A = avec[pairs[:, 0]]
    P = pvec[pairs[:, 1]]
    dot = np.sum(A * P, axis=1).astype(np.float32)
    # The epsilon on each norm keeps all-zero (missing) embeddings from dividing by zero.
    cos = (dot / ((np.linalg.norm(A, axis=1) + 1e-8) * (np.linalg.norm(P, axis=1) + 1e-8))).astype(np.float32)
    had = np.mean(A * P, axis=1).astype(np.float32)
    absdiff = np.mean(np.abs(A - P), axis=1).astype(np.float32)
    l2 = np.sqrt(np.sum((A - P) ** 2, axis=1)).astype(np.float32)

    dot_ar = np.zeros(len(pairs), dtype=np.float32)
    cos_ar = np.zeros(len(pairs), dtype=np.float32)
    dot_pct = np.zeros(len(pairs), dtype=np.float32)
    cos_pct = np.zeros(len(pairs), dtype=np.float32)
    df = pd.DataFrame({"idx": np.arange(len(pairs)), "author": pairs[:, 0], "dot": dot, "cos": cos})
    for _, g in df.groupby("author", sort=False):
        idx = g["idx"].to_numpy()
        n = len(idx)
        # A single-candidate author gets percentile 1.0 (and integer rank 0).
        vals = np.linspace(0, 1, n, dtype=np.float32) if n > 1 else np.array([1.0], dtype=np.float32)
        od = np.argsort(g["dot"].to_numpy(), kind="mergesort")
        oc = np.argsort(g["cos"].to_numpy(), kind="mergesort")
        # Ascending order: the lowest score within the author gets rank 0.
        dot_ar[idx[od]] = np.arange(n, dtype=np.float32)
        cos_ar[idx[oc]] = np.arange(n, dtype=np.float32)
        dot_pct[idx[od]] = vals
        cos_pct[idx[oc]] = vals

    X = np.column_stack([dot, cos, had, absdiff, l2, rank01(dot), rank01(cos), dot_ar, cos_ar, dot_pct, cos_pct]).astype(np.float32)
    np.savez_compressed(path, X=X)
    return X, names
|
|
|
|
def build_base_features(root: Path, split_seed: int, main_score_file: Path):
    """Rebuild the stacker's baseline feature matrix for the validation split.

    Helper modules are loaded from ``root / "code"`` by file path. The matrix
    concatenates, in order: rank features of the main score, explicit graph
    features, negative-evidence features, top-k content similarity, features
    from the selected variant scores, and features derived from the
    content-mean and MF-BPR scores.

    Args:
        root: Package root containing ``code`` and ``validation_runs``.
        split_seed: Seed passed to the split helper; also selects the
            ``dynamic_seed<split_seed>`` run directory.
        main_score_file: ``.npy`` file with the main model's validation scores.

    Returns:
        ``(train_refs, pairs, y, X)``: the training references from the split
        helper, the validation ``(source, target)`` pairs as int64, the int8
        labels, and the float32 feature matrix.
    """
    code_dir = root / "code"
    stack = load_module("stack", code_dir / "stack_rank_calibration.py")
    lgcn = load_module("lgcn", code_dir / "train_val_lgcn_ensemble.py")
    post = load_module("post", code_dir / "post95_ablation.py")
    gen = load_module("gen", code_dir / "generate_post95_submission.py")
    extra = load_module("extra", code_dir / "extra_score_sources_ablation.py")

    # NOTE(review): 0.9 is presumably the train fraction of the split — confirm
    # against make_notebook_style_split.
    train_refs, val_pairs = lgcn.make_notebook_style_split(root, split_seed, 0.9)
    pairs = val_pairs[["source", "target"]].to_numpy(np.int64)
    y = val_pairs["label"].to_numpy(np.int8)
    main_scores = np.load(main_score_file).astype(np.float32)

    # Helper calls stay in their original order in case the builder carries state.
    builder = stack.ExplicitGraphFeatures(root, train_refs)
    explicit = builder.transform(pairs)
    rank_feats = stack.add_rank_features(pairs, main_scores)
    negative_feats = post.negative_evidence_features(explicit, main_scores)
    topk_feats = gen.topk_content_similarity_fast(root, pairs, builder)
    X = np.column_stack([rank_feats, explicit, negative_feats, topk_feats]).astype(np.float32)

    run_dir = root / "validation_runs" / f"dynamic_seed{split_seed}"
    listing = (run_dir / "post95_submission" / "selected_variant_val_scores.txt").read_text()
    selected = [Path(entry.strip()) for entry in listing.splitlines() if entry.strip()]
    variant_scores = [np.load(score_path).astype(np.float32) for score_path in selected]
    X = np.column_stack([X, gen.variant_feature_matrix(post, variant_scores)]).astype(np.float32)

    content = extra.content_mean_score(root, pairs, builder)
    mf = np.load(run_dir / "extra_score_sources" / "val_mf_bpr_s202_d256.npy").astype(np.float32)
    content_feats, _ = extra.score_to_features(content, "content_mean_cos", pairs)
    mf_feats, _ = extra.score_to_features(mf, "mf_bpr", pairs)
    X = np.column_stack([X, content_feats, mf_feats]).astype(np.float32)
    return train_refs, pairs, y, X
|
|
|
|
def train_full_predict(X: np.ndarray, y: np.ndarray, X_test: np.ndarray, seed: int):
    """Fit one LightGBM classifier on all of ``X`` and score ``X_test``.

    Returns:
        ``(scores, classifier)``: float32 positive-class probabilities for
        ``X_test`` and the fitted classifier.
    """
    params = dict(
        n_estimators=1200,
        learning_rate=0.025,
        num_leaves=31,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=5.0,
        min_child_samples=80,
        objective="binary",
        verbose=-1,
        random_state=seed,
    )
    classifier = lgb.LGBMClassifier(**params)
    classifier.fit(X, y)
    # Column 1 of predict_proba is the positive class.
    scores = classifier.predict_proba(X_test)[:, 1].astype(np.float32)
    return scores, classifier
|
|
|
|
def _ablation_row(descr: dict, metrics: tuple, submission_path: str) -> dict:
    """Build one results-table row; key order fixes the CSV column order."""
    f1, th, auc, p, r = metrics
    return {
        "version_name": descr["version_name"],
        "graph_type": descr["graph_type"],
        "method": descr["method"],
        "dim": descr["dim"],
        "walk_length": descr["walk_length"],
        "num_walks": descr["num_walks"],
        "window": descr["window"],
        "p": descr["p"],
        "q": descr["q"],
        "validation_F1": f1,
        "threshold": th,
        "auc": auc,
        "precision": p,
        "recall": r,
        # The columns below are placeholders; main() does not compute them.
        "predicted_positive_ratio": np.nan,
        "public_submission_path": submission_path,
        "changed_predictions_vs_current_best": np.nan,
        "rw_feature_importance_best_rank": np.nan,
    }


def _ensemble_aggregates(blocks: list[np.ndarray]) -> np.ndarray:
    """Summarise dot, cosine and per-author cosine percentile across feature blocks."""
    # Column positions inside a pair_feature_block matrix.
    dot_col, cos_col, cos_author_pct_col = 0, 1, 10
    cos_stack = np.vstack([b[:, cos_col] for b in blocks])
    dot_stack = np.vstack([b[:, dot_col] for b in blocks])
    pct_stack = np.vstack([b[:, cos_author_pct_col] for b in blocks])
    # Number of models that place the pair in the upper half of its author's candidates.
    agree = (pct_stack >= 0.5).sum(axis=0).astype(np.float32)
    return np.column_stack(
        [
            cos_stack.mean(axis=0),
            cos_stack.std(axis=0),
            cos_stack.max(axis=0),
            cos_stack.min(axis=0),
            dot_stack.mean(axis=0),
            dot_stack.std(axis=0),
            pct_stack.mean(axis=0),
            pct_stack.std(axis=0),
            pct_stack.max(axis=0),
            agree,
        ]
    ).astype(np.float32)


def main() -> None:
    """Run a random-walk ablation sweep and an ensemble of its best configs.

    For every configuration of the selected ``--mode`` the graph is built, the
    embedding model is trained (or loaded), pair features are appended to the
    base stacker features and out-of-fold LightGBM predictions are scored. The
    results table is rewritten after each configuration. Finally the feature
    blocks of up to five best configurations (by validation F1) are combined
    with cross-model aggregates into one ensemble run, which is added to the
    table.

    Outputs are written under
    ``<package-root>/validation_runs/dynamic_seed<split-seed>/randomwalk_systematic``.
    """
    config_factories = {
        "small": small_configs,
        "graph": graph_configs,
        "extra": extra_configs,
    }

    parser = argparse.ArgumentParser()
    parser.add_argument("--package-root", type=Path, default=Path(__file__).resolve().parents[1])
    parser.add_argument("--split-seed", type=int, default=202)
    parser.add_argument("--main-val-score-file", type=Path, required=True)
    parser.add_argument("--workers", type=int, default=8)
    parser.add_argument("--seed", type=int, default=202)
    parser.add_argument("--n-splits", type=int, default=5)
    parser.add_argument("--mode", choices=list(config_factories), default="small")
    args = parser.parse_args()

    root = args.package_root
    run_dir = root / "validation_runs" / f"dynamic_seed{args.split_seed}"
    out_dir = run_dir / "randomwalk_systematic"
    model_dir = out_dir / "models"
    out_dir.mkdir(parents=True, exist_ok=True)
    model_dir.mkdir(parents=True, exist_ok=True)
    table_path = out_dir / f"{args.mode}_ablation_table.csv"

    train_refs, pairs, y, X_base = build_base_features(root, args.split_seed, args.main_val_score_file)
    configs = config_factories[args.mode]()

    rows: list[dict] = []
    feature_blocks: list[np.ndarray] = []
    for cfg in configs:
        print(f"\n=== {cfg.version_name} ===")
        G = build_graph(root, train_refs, cfg.graph_type)
        print(f"graph_type={cfg.graph_type} nodes={G.number_of_nodes()} edges={G.number_of_edges()}")
        model = train_model(G, cfg, model_dir, args.workers)
        block, _names = pair_feature_block(model, pairs, cfg, root, args.split_seed, train_refs)
        X = np.column_stack([X_base, block]).astype(np.float32)
        # Each configuration gets its own fold seed, offset by its position.
        oof = fit_lgb_oof(X, y, args.seed + len(rows) * 13, args.n_splits)
        metrics = best_f1(y, oof)
        np.save(out_dir / f"{cfg.version_name}_oof.npy", oof)

        # Path is recorded in the table only; this function does not write the file.
        sub_path = out_dir / "single_submissions" / f"submission_{cfg.version_name}_th0.480000.csv"
        descr = {
            "version_name": cfg.version_name,
            "graph_type": cfg.graph_type,
            "method": cfg.method,
            "dim": cfg.dim,
            "walk_length": cfg.walk_length,
            "num_walks": cfg.num_walks,
            "window": cfg.window,
            "p": cfg.p,
            "q": cfg.q,
        }
        rows.append(_ablation_row(descr, metrics, str(sub_path)))
        feature_blocks.append(block)

        # Checkpoint the table after every configuration so partial sweeps are kept.
        result = pd.DataFrame(rows).sort_values("validation_F1", ascending=False)
        result.to_csv(table_path, index=False)
        print(result.to_string(index=False))

    # After sorting, the frame index still holds each row's position in
    # ``rows`` and therefore in ``feature_blocks``.
    result = pd.DataFrame(rows).sort_values("validation_F1", ascending=False)
    top_idx = result.index[:5].to_list()
    blocks = [feature_blocks[i] for i in top_idx]
    agg = _ensemble_aggregates(blocks)
    X_ens = np.column_stack([X_base, *blocks, agg]).astype(np.float32)
    oof = fit_lgb_oof(X_ens, y, args.seed + 999, args.n_splits)
    metrics = best_f1(y, oof)
    np.save(out_dir / f"{args.mode}_ensemble_oof.npy", oof)

    ens_descr = {
        "version_name": f"{args.mode}_top{len(blocks)}_rw_ensemble",
        "graph_type": "mixed",
        "method": "RWEnsemble",
        "dim": np.nan,
        "walk_length": np.nan,
        "num_walks": np.nan,
        "window": np.nan,
        "p": np.nan,
        "q": np.nan,
    }
    ens_row = _ablation_row(ens_descr, metrics, str(run_dir / "randomwalk_ensemble_submission"))
    result = pd.concat([result, pd.DataFrame([ens_row])], ignore_index=True).sort_values("validation_F1", ascending=False)
    result.to_csv(table_path, index=False)
    print("\nFinal table:")
    print(result.to_string(index=False))


if __name__ == "__main__":
    main()
|
|