"""DeepWalk/Node2Vec score sources for the post95 stacker.""" from __future__ import annotations import argparse import importlib.util from pathlib import Path import lightgbm as lgb import networkx as nx import numpy as np import pandas as pd from gensim.models import Word2Vec from node2vec import Node2Vec from sklearn.metrics import precision_recall_curve, roc_auc_score from sklearn.model_selection import StratifiedKFold def load_module(name: str, path: Path): spec = importlib.util.spec_from_file_location(name, path) module = importlib.util.module_from_spec(spec) assert spec.loader is not None spec.loader.exec_module(module) return module def read_txt(path: Path) -> list[list[int]]: return [list(map(int, line.strip().split())) for line in path.open()] def best_f1(y: np.ndarray, s: np.ndarray): p, r, t = precision_recall_curve(y, s) f = 2 * p * r / (p + r + 1e-12) i = int(np.argmax(f)) th = float(t[i]) if i < len(t) else 0.5 return float(f[i]), th, float(roc_auc_score(y, s)), float(p[i]), float(r[i]) def rank01(x: np.ndarray) -> np.ndarray: order = np.argsort(x, kind="mergesort") out = np.empty(len(x), dtype=np.float32) out[order] = np.linspace(0, 1, len(x), dtype=np.float32) return out def zscore(x: np.ndarray) -> np.ndarray: return ((x - x.mean()) / (x.std() + 1e-8)).astype(np.float32) def fit_lgb_oof(X: np.ndarray, y: np.ndarray, seed: int, n_splits: int) -> np.ndarray: oof = np.zeros(len(y), dtype=np.float32) skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed) for fold, (tr, va) in enumerate(skf.split(X, y), start=1): clf = lgb.LGBMClassifier( n_estimators=1200, learning_rate=0.025, num_leaves=31, subsample=0.9, colsample_bytree=0.9, reg_lambda=5.0, min_child_samples=80, objective="binary", verbose=-1, random_state=seed + fold, ) clf.fit(X[tr], y[tr]) oof[va] = clf.predict_proba(X[va])[:, 1] return oof def score_to_features(scores: np.ndarray, prefix: str, pairs: np.ndarray) -> tuple[np.ndarray, list[str]]: author_rank = np.zeros(len(scores), dtype=np.float32) df = pd.DataFrame({"idx": np.arange(len(scores)), "author": pairs[:, 0], "score": scores}) for _, g in df.groupby("author", sort=False): idx = g["idx"].to_numpy() order = np.argsort(g["score"].to_numpy(), kind="mergesort") vals = np.linspace(0, 1, len(idx), dtype=np.float32) if len(idx) > 1 else np.array([1.0], dtype=np.float32) author_rank[idx[order]] = vals return np.column_stack([scores, zscore(scores), rank01(scores), author_rank]).astype(np.float32), [ prefix, f"{prefix}_z", f"{prefix}_rank", f"{prefix}_author_rank", ] def build_graph(root: Path, train_refs: pd.DataFrame) -> nx.Graph: data_dir = root / "data_and_docs" G = nx.Graph() G.add_nodes_from([f"a{a}" for a in range(6611)]) G.add_nodes_from([f"p{p}" for p in range(79937)]) for a, p in train_refs[["source", "target"]].to_numpy(np.int64): G.add_edge(f"a{int(a)}", f"p{int(p)}", weight=3.0) for a, b in read_txt(data_dir / "author_file_ann.txt"): G.add_edge(f"a{a}", f"a{b}", weight=1.0) for s, t in read_txt(data_dir / "paper_file_ann.txt"): G.add_edge(f"p{s}", f"p{t}", weight=1.0) return G def deepwalk_walks(G: nx.Graph, walk_length: int, num_walks: int, seed: int) -> list[list[str]]: rng = np.random.default_rng(seed) nodes = np.array(list(G.nodes()), dtype=object) neigh = {n: list(G.neighbors(n)) for n in G.nodes()} walks: list[list[str]] = [] for _ in range(num_walks): order = nodes.copy() rng.shuffle(order) for start in order: walk = [start] cur = start for _step in range(walk_length - 1): ns = neigh[cur] if not ns: break cur = ns[int(rng.integers(0, len(ns)))] walk.append(cur) walks.append(walk) return walks def train_deepwalk(G: nx.Graph, out_path: Path, dim: int, walk_length: int, num_walks: int, window: int, seed: int, workers: int) -> Word2Vec: if out_path.exists(): return Word2Vec.load(str(out_path)) walks = deepwalk_walks(G, walk_length, num_walks, seed) model = Word2Vec( sentences=walks, vector_size=dim, window=window, min_count=0, sg=1, negative=5, epochs=3, workers=workers, seed=seed, ) model.save(str(out_path)) return model def train_node2vec(G: nx.Graph, out_path: Path, dim: int, walk_length: int, num_walks: int, window: int, p: float, q: float, seed: int, workers: int) -> Word2Vec: if out_path.exists(): return Word2Vec.load(str(out_path)) n2v = Node2Vec(G, dimensions=dim, walk_length=walk_length, num_walks=num_walks, p=p, q=q, workers=workers, seed=seed, quiet=False) model = n2v.fit(window=window, min_count=0, batch_words=4096, seed=seed, epochs=3) model.save(str(out_path)) return model def pair_scores(model: Word2Vec, pairs: np.ndarray, prefix: str, root: Path, split_seed: int) -> tuple[np.ndarray, np.ndarray]: cache = root / "validation_runs" / f"dynamic_seed{split_seed}" / "node2vec_deepwalk" cache.mkdir(parents=True, exist_ok=True) path_cos = cache / f"{prefix}_cos_{len(pairs)}_{int(pairs[:,0].sum())}_{int(pairs[:,1].sum())}.npy" path_dot = cache / f"{prefix}_dot_{len(pairs)}_{int(pairs[:,0].sum())}_{int(pairs[:,1].sum())}.npy" if path_cos.exists() and path_dot.exists(): return np.load(path_cos), np.load(path_dot) dim = model.vector_size avec = np.zeros((6611, dim), dtype=np.float32) pvec = np.zeros((79937, dim), dtype=np.float32) for a in range(6611): key = f"a{a}" if key in model.wv: avec[a] = model.wv[key] for p in range(79937): key = f"p{p}" if key in model.wv: pvec[p] = model.wv[key] A = avec[pairs[:, 0]] P = pvec[pairs[:, 1]] dot = np.sum(A * P, axis=1).astype(np.float32) cos = (dot / ((np.linalg.norm(A, axis=1) + 1e-8) * (np.linalg.norm(P, axis=1) + 1e-8))).astype(np.float32) np.save(path_cos, cos) np.save(path_dot, dot) return cos, dot def build_current_best_features(root: Path, split_seed: int, main_score_file: Path): stack = load_module("stack", root / "code" / "stack_rank_calibration.py") lgcn = load_module("lgcn", root / "code" / "train_val_lgcn_ensemble.py") post = load_module("post", root / "code" / "post95_ablation.py") gen = load_module("gen", root / "code" / "generate_post95_submission.py") extra = load_module("extra", root / "code" / "extra_score_sources_ablation.py") train_refs, val_pairs = lgcn.make_notebook_style_split(root, split_seed, 0.9) pairs = val_pairs[["source", "target"]].to_numpy(np.int64) y = val_pairs["label"].to_numpy(np.int8) main = np.load(main_score_file).astype(np.float32) builder = stack.ExplicitGraphFeatures(root, train_refs) Xh = builder.transform(pairs) X = np.column_stack( [ stack.add_rank_features(pairs, main), Xh, post.negative_evidence_features(Xh, main), gen.topk_content_similarity_fast(root, pairs, builder), ] ).astype(np.float32) selected = [Path(x.strip()) for x in (root / "validation_runs" / f"dynamic_seed{split_seed}" / "post95_submission" / "selected_variant_val_scores.txt").read_text().splitlines() if x.strip()] X = np.column_stack([X, gen.variant_feature_matrix(post, [np.load(p).astype(np.float32) for p in selected])]).astype(np.float32) content = extra.content_mean_score(root, pairs, builder) mf = np.load(root / "validation_runs" / f"dynamic_seed{split_seed}" / "extra_score_sources" / "val_mf_bpr_s202_d256.npy").astype(np.float32) Xc, _ = score_to_features(content, "content_mean_cos", pairs) Xm, _ = score_to_features(mf, "mf_bpr", pairs) X = np.column_stack([X, Xc, Xm]).astype(np.float32) return train_refs, pairs, y, X def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("--package-root", type=Path, default=Path(__file__).resolve().parents[1]) parser.add_argument("--split-seed", type=int, default=202) parser.add_argument("--main-val-score-file", type=Path, required=True) parser.add_argument("--dim", type=int, default=128) parser.add_argument("--walk-length", type=int, default=24) parser.add_argument("--num-walks", type=int, default=4) parser.add_argument("--window", type=int, default=8) parser.add_argument("--workers", type=int, default=8) parser.add_argument("--seed", type=int, default=202) parser.add_argument("--n-splits", type=int, default=5) args = parser.parse_args() root = args.package_root out_dir = root / "validation_runs" / f"dynamic_seed{args.split_seed}" / "node2vec_deepwalk" out_dir.mkdir(parents=True, exist_ok=True) train_refs, pairs, y, X_base = build_current_best_features(root, args.split_seed, args.main_val_score_file) G = build_graph(root, train_refs) print(f"graph nodes={G.number_of_nodes()} edges={G.number_of_edges()}") rows = [] base_oof = fit_lgb_oof(X_base, y, args.seed, args.n_splits) f1, th, auc, p, r = best_f1(y, base_oof) rows.append({"stage": "content_mf_baseline", "f1": f1, "threshold": th, "auc": auc, "precision": p, "recall": r, "n_features": X_base.shape[1]}) np.save(out_dir / "baseline_oof.npy", base_oof) blocks = [] for name, model in [ ("deepwalk", train_deepwalk(G, out_dir / f"deepwalk_d{args.dim}.model", args.dim, args.walk_length, args.num_walks, args.window, args.seed, args.workers)), ("node2vec", train_node2vec(G, out_dir / f"node2vec_d{args.dim}_p1_q2.model", args.dim, args.walk_length, args.num_walks, args.window, 1.0, 2.0, args.seed, args.workers)), ]: cos, dot = pair_scores(model, pairs, name, root, args.split_seed) Xcos, _ = score_to_features(cos, f"{name}_cos", pairs) Xdot, _ = score_to_features(dot, f"{name}_dot", pairs) block = np.column_stack([Xcos, Xdot]).astype(np.float32) blocks.append(block) X_cur = np.column_stack([X_base, *blocks]).astype(np.float32) oof = fit_lgb_oof(X_cur, y, args.seed + len(blocks) * 17, args.n_splits) f1, th, auc, p, r = best_f1(y, oof) rows.append({"stage": f"+{name}", "f1": f1, "threshold": th, "auc": auc, "precision": p, "recall": r, "n_features": X_cur.shape[1]}) np.save(out_dir / f"{name}_stack_oof.npy", oof) result = pd.DataFrame(rows).sort_values("f1", ascending=False) result.to_csv(out_dir / "node2vec_deepwalk_ablation.csv", index=False) print(result.to_string(index=False)) if __name__ == "__main__": main()