# cs3319-project2 / code / node2vec_deepwalk_ablation.py
# NLP-beginner's picture
# CS3319 Project 2 final deliverable (public F1 = 0.96626)
# Revision: f28d994
# Raw
# History Blame Contribute Delete
# Size: 11 kB
"""DeepWalk/Node2Vec score sources for the post95 stacker."""
from __future__ import annotations
import argparse
import importlib.util
from pathlib import Path
import lightgbm as lgb
import networkx as nx
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from node2vec import Node2Vec
from sklearn.metrics import precision_recall_curve, roc_auc_score
from sklearn.model_selection import StratifiedKFold
def load_module(name: str, path: Path):
    """Execute the Python source file at ``path`` as a module named ``name``.

    The module object is returned to the caller only; it is not inserted into
    ``sys.modules``, so every call re-executes the file.

    Args:
        name: Module name to assign to the loaded module.
        path: Location of the source file.

    Returns:
        The executed module object.

    Raises:
        ImportError: If no import spec or loader can be derived for ``path``
            (for example, the file suffix is not a recognised source suffix).
    """
    spec = importlib.util.spec_from_file_location(name, path)
    # spec_from_file_location returns None when it cannot pick a loader; fail
    # with a clear error here instead of an AttributeError further down.
    if spec is None or spec.loader is None:
        raise ImportError(f"cannot build an import spec for module {name!r} from {path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
def read_txt(path: Path) -> list[list[int]]:
    """Parse a whitespace-separated integer file into one list per line.

    Blank (or whitespace-only) lines are skipped so that a trailing newline or
    spacer line does not produce an empty row that callers cannot unpack.

    Args:
        path: Text file whose non-blank lines contain integers separated by
            whitespace.

    Returns:
        A list with one ``list[int]`` per non-blank line, in file order.

    Raises:
        ValueError: If a token cannot be parsed as an integer.
    """
    rows: list[list[int]] = []
    # Context manager guarantees the handle is closed even if parsing fails.
    with path.open() as handle:
        for line in handle:
            tokens = line.split()
            if tokens:
                rows.append([int(tok) for tok in tokens])
    return rows
def best_f1(y: np.ndarray, s: np.ndarray):
    """Find the operating point on the precision-recall curve with the highest F1.

    Args:
        y: Binary ground-truth labels.
        s: Scores for the positive class, aligned with ``y``.

    Returns:
        A tuple ``(f1, threshold, auc, precision, recall)`` of Python floats,
        where ``auc`` is the ROC AUC of ``s`` against ``y`` and the other
        values are taken at the F1-maximising curve index.
    """
    precision, recall, thresholds = precision_recall_curve(y, s)
    # The small constant keeps the ratio defined where precision + recall == 0.
    f1_curve = 2 * precision * recall / (precision + recall + 1e-12)
    best = int(np.argmax(f1_curve))
    if best < len(thresholds):
        threshold = float(thresholds[best])
    else:
        # The best index has no matching threshold entry; use a neutral value.
        threshold = 0.5
    auc = float(roc_auc_score(y, s))
    return float(f1_curve[best]), threshold, auc, float(precision[best]), float(recall[best])
def rank01(x: np.ndarray) -> np.ndarray:
    """Map each value of ``x`` to its rank, rescaled evenly onto ``[0, 1]``.

    Ties are broken by original position (stable sort), so equal values get
    distinct, increasing ranks.

    Args:
        x: One-dimensional array of values.

    Returns:
        ``float32`` array with the same length as ``x``; the smallest value
        maps to 0 and, for two or more elements, the largest maps to 1.
    """
    ascending = np.argsort(x, kind="mergesort")
    grid = np.linspace(0, 1, len(x), dtype=np.float32)
    # Sorting a permutation yields its inverse, i.e. each element's rank.
    rank_of_each = np.argsort(ascending, kind="mergesort")
    return grid[rank_of_each]
def zscore(x: np.ndarray) -> np.ndarray:
    """Standardise ``x`` to zero mean and unit standard deviation.

    Args:
        x: Array of values.

    Returns:
        ``float32`` array of ``(x - mean) / (std + 1e-8)``.
    """
    centered = x - x.mean()
    # The epsilon keeps constant inputs from dividing by zero.
    spread = x.std() + 1e-8
    return (centered / spread).astype(np.float32)
def fit_lgb_oof(X: np.ndarray, y: np.ndarray, seed: int, n_splits: int) -> np.ndarray:
    """Produce out-of-fold LightGBM probabilities with stratified K-fold CV.

    Args:
        X: Feature matrix, one row per sample.
        y: Binary labels aligned with the rows of ``X``.
        seed: Seed for the fold shuffling; fold ``k`` (1-based) trains its
            classifier with ``random_state=seed + k``.
        n_splits: Number of stratified folds.

    Returns:
        ``float32`` array of positive-class probabilities, each predicted by
        the model that did not see that row during training.
    """
    shared_params = dict(
        n_estimators=1200,
        learning_rate=0.025,
        num_leaves=31,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=5.0,
        min_child_samples=80,
        objective="binary",
        verbose=-1,
    )
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    predictions = np.zeros(len(y), dtype=np.float32)
    fold_id = 0
    for train_idx, valid_idx in splitter.split(X, y):
        fold_id += 1
        booster = lgb.LGBMClassifier(random_state=seed + fold_id, **shared_params)
        booster.fit(X[train_idx], y[train_idx])
        # Column 1 of predict_proba is the positive-class probability.
        predictions[valid_idx] = booster.predict_proba(X[valid_idx])[:, 1]
    return predictions
def score_to_features(scores: np.ndarray, prefix: str, pairs: np.ndarray) -> tuple[np.ndarray, list[str]]:
    """Expand one score vector into four stacker features.

    The columns are, in order: the raw score, its global z-score, its global
    rank scaled to ``[0, 1]``, and its rank among the rows that share the same
    value in ``pairs[:, 0]`` (the "author" column), also scaled to ``[0, 1]``.

    Args:
        scores: One score per pair.
        prefix: Base name used to label the four columns.
        pairs: Array whose first column groups rows for the per-author rank.

    Returns:
        ``(features, names)`` where ``features`` is a ``float32`` matrix with
        four columns and ``names`` lists the matching column labels.
    """
    n_rows = len(scores)
    frame = pd.DataFrame({"idx": np.arange(n_rows), "author": pairs[:, 0], "score": scores})
    within_author = np.zeros(n_rows, dtype=np.float32)
    for _author, group in frame.groupby("author", sort=False):
        members = group["idx"].to_numpy()
        ascending = np.argsort(group["score"].to_numpy(), kind="mergesort")
        if len(members) > 1:
            levels = np.linspace(0, 1, len(members), dtype=np.float32)
        else:
            # A lone candidate is treated as the top of its group.
            levels = np.array([1.0], dtype=np.float32)
        within_author[members[ascending]] = levels
    columns = [scores, zscore(scores), rank01(scores), within_author]
    features = np.column_stack(columns).astype(np.float32)
    names = [
        prefix,
        f"{prefix}_z",
        f"{prefix}_rank",
        f"{prefix}_author_rank",
    ]
    return features, names
def build_graph(root: Path, train_refs: pd.DataFrame) -> nx.Graph:
    """Assemble the undirected author/paper graph used for random walks.

    Nodes are named ``a<id>`` for authors and ``p<id>`` for papers. Edges come
    from three sources: the ``source``/``target`` rows of ``train_refs``
    (author-paper, weight 3.0) and the two annotation files under
    ``root / "data_and_docs"`` (author-author and paper-paper, weight 1.0).

    Args:
        root: Package root containing the ``data_and_docs`` directory.
        train_refs: Frame with integer ``source`` (author) and ``target``
            (paper) columns.

    Returns:
        The populated ``networkx.Graph``.
    """
    graph = nx.Graph()
    # Pre-register the full id ranges so nodes without edges still exist.
    graph.add_nodes_from([f"a{author_id}" for author_id in range(6611)])
    graph.add_nodes_from([f"p{paper_id}" for paper_id in range(79937)])

    author_paper = train_refs[["source", "target"]].to_numpy(np.int64)
    for author_id, paper_id in author_paper:
        graph.add_edge(f"a{int(author_id)}", f"p{int(paper_id)}", weight=3.0)

    annotations = root / "data_and_docs"
    for left, right in read_txt(annotations / "author_file_ann.txt"):
        graph.add_edge(f"a{left}", f"a{right}", weight=1.0)
    for src, dst in read_txt(annotations / "paper_file_ann.txt"):
        graph.add_edge(f"p{src}", f"p{dst}", weight=1.0)
    return graph
def deepwalk_walks(G: nx.Graph, walk_length: int, num_walks: int, seed: int) -> list[list[str]]:
    """Generate uniform (unweighted) random walks starting from every node.

    For each of ``num_walks`` passes the node order is shuffled and one walk is
    started at every node. Each step moves to a neighbour chosen uniformly at
    random; edge weights are not consulted. A walk stops early when it reaches
    a node without neighbours.

    Args:
        G: Graph exposing ``nodes()`` and ``neighbors(node)``.
        walk_length: Maximum number of nodes in a walk, including the start.
        num_walks: Number of passes over all nodes.
        seed: Seed for the NumPy random generator.

    Returns:
        ``num_walks * number_of_nodes`` walks, each a list of node labels.
    """
    generator = np.random.default_rng(seed)
    # Materialise neighbour lists once so each step is a plain list lookup.
    adjacency = {node: list(G.neighbors(node)) for node in G.nodes()}
    all_nodes = np.array(list(G.nodes()), dtype=object)
    corpus: list[list[str]] = []
    for _pass in range(num_walks):
        shuffled = all_nodes.copy()
        generator.shuffle(shuffled)
        for origin in shuffled:
            path = [origin]
            while len(path) < walk_length:
                options = adjacency[path[-1]]
                if not options:
                    break
                pick = int(generator.integers(0, len(options)))
                path.append(options[pick])
            corpus.append(path)
    return corpus
def train_deepwalk(G: nx.Graph, out_path: Path, dim: int, walk_length: int, num_walks: int, window: int, seed: int, workers: int) -> Word2Vec:
    """Train (or load from disk) a skip-gram model on uniform random walks.

    If ``out_path`` already exists the saved model is loaded and every other
    argument is ignored. Otherwise walks are generated with
    ``deepwalk_walks``, a ``Word2Vec`` model is fitted and saved to
    ``out_path``.

    Args:
        G: Graph to walk on.
        out_path: Model file used both as cache and as save destination.
        dim: Embedding size.
        walk_length: Maximum nodes per walk.
        num_walks: Walks started from each node.
        window: Skip-gram context window.
        seed: Seed shared by the walk generator and Word2Vec.
        workers: Number of Word2Vec worker threads.

    Returns:
        The loaded or freshly trained ``Word2Vec`` model.
    """
    if not out_path.exists():
        corpus = deepwalk_walks(G, walk_length, num_walks, seed)
        embedder = Word2Vec(
            sentences=corpus,
            vector_size=dim,
            window=window,
            min_count=0,  # keep every node, however rarely it is visited
            sg=1,  # skip-gram
            negative=5,
            epochs=3,
            workers=workers,
            seed=seed,
        )
        embedder.save(str(out_path))
        return embedder
    return Word2Vec.load(str(out_path))
def train_node2vec(G: nx.Graph, out_path: Path, dim: int, walk_length: int, num_walks: int, window: int, p: float, q: float, seed: int, workers: int) -> Word2Vec:
    """Train (or load from disk) a Node2Vec embedding model.

    If ``out_path`` already exists the saved model is loaded and every other
    argument is ignored. Otherwise a ``Node2Vec`` walker is built on ``G``,
    fitted, and the resulting model is saved to ``out_path``.

    Args:
        G: Graph to embed.
        out_path: Model file used both as cache and as save destination.
        dim: Embedding size.
        walk_length: Nodes per walk.
        num_walks: Walks started from each node.
        window: Skip-gram context window.
        p: Node2Vec ``p`` parameter, forwarded unchanged.
        q: Node2Vec ``q`` parameter, forwarded unchanged.
        seed: Seed passed to both the walker and the fit call.
        workers: Number of workers for walk generation.

    Returns:
        The loaded or freshly trained ``Word2Vec`` model.
    """
    if not out_path.exists():
        walker = Node2Vec(
            G,
            dimensions=dim,
            walk_length=walk_length,
            num_walks=num_walks,
            p=p,
            q=q,
            workers=workers,
            seed=seed,
            quiet=False,
        )
        embedder = walker.fit(window=window, min_count=0, batch_words=4096, seed=seed, epochs=3)
        embedder.save(str(out_path))
        return embedder
    return Word2Vec.load(str(out_path))
def pair_scores(model: Word2Vec, pairs: np.ndarray, prefix: str, root: Path, split_seed: int) -> tuple[np.ndarray, np.ndarray]:
    """Score (author, paper) pairs by cosine and dot product of their embeddings.

    Results are cached as ``.npy`` files under
    ``root / "validation_runs" / f"dynamic_seed{split_seed}" / "node2vec_deepwalk"``.
    When both cache files exist they are returned without touching ``model``.

    Args:
        model: Embedding model keyed by ``a<id>`` / ``p<id>`` node labels.
        pairs: Integer array; column 0 holds author ids, column 1 paper ids.
        prefix: Label used in the cache file names.
        root: Package root that hosts the cache directory.
        split_seed: Seed identifying the validation split directory.

    Returns:
        ``(cos, dot)`` as ``float32`` arrays aligned with ``pairs``.
    """
    cache_dir = root / "validation_runs" / f"dynamic_seed{split_seed}" / "node2vec_deepwalk"
    cache_dir.mkdir(parents=True, exist_ok=True)
    # NOTE(review): the cache name depends only on ``prefix``, the number of
    # pairs and the column sums; it does not identify the model, so scores
    # from a different model under the same prefix would be reused. Confirm
    # this is acceptable for the intended workflow.
    cos_file = cache_dir / f"{prefix}_cos_{len(pairs)}_{int(pairs[:,0].sum())}_{int(pairs[:,1].sum())}.npy"
    dot_file = cache_dir / f"{prefix}_dot_{len(pairs)}_{int(pairs[:,0].sum())}_{int(pairs[:,1].sum())}.npy"
    if cos_file.exists() and dot_file.exists():
        return np.load(cos_file), np.load(dot_file)

    width = model.vector_size
    vectors = model.wv
    # Nodes missing from the vocabulary keep an all-zero embedding.
    author_table = np.zeros((6611, width), dtype=np.float32)
    for author_id in range(6611):
        token = f"a{author_id}"
        if token in vectors:
            author_table[author_id] = vectors[token]
    paper_table = np.zeros((79937, width), dtype=np.float32)
    for paper_id in range(79937):
        token = f"p{paper_id}"
        if token in vectors:
            paper_table[paper_id] = vectors[token]

    author_rows = author_table[pairs[:, 0]]
    paper_rows = paper_table[pairs[:, 1]]
    dot = np.sum(author_rows * paper_rows, axis=1).astype(np.float32)
    # The epsilon on each norm keeps zero vectors from dividing by zero.
    author_norm = np.linalg.norm(author_rows, axis=1) + 1e-8
    paper_norm = np.linalg.norm(paper_rows, axis=1) + 1e-8
    cos = (dot / (author_norm * paper_norm)).astype(np.float32)
    np.save(cos_file, cos)
    np.save(dot_file, dot)
    return cos, dot
def build_current_best_features(root: Path, split_seed: int, main_score_file: Path):
    """Rebuild the baseline stacker feature matrix for the validation split.

    Sibling scripts under ``root / "code"`` are loaded by file path and their
    helpers are combined, in order, into one ``float32`` matrix: rank features
    of the main score, explicit graph features, negative-evidence features,
    top-k content similarity, the selected variant-score features, and the
    four-column expansions of the content-mean and MF-BPR scores.

    Args:
        root: Package root containing ``code`` and ``validation_runs``.
        split_seed: Seed passed to the split helper and used to locate the
            per-split ``validation_runs`` directory.
        main_score_file: ``.npy`` file with the main validation scores.

    Returns:
        ``(train_refs, pairs, y, X)``: the training references from the split
        helper, the ``int64`` (source, target) validation pairs, the ``int8``
        labels, and the ``float32`` feature matrix.
    """
    code_dir = root / "code"
    stack = load_module("stack", code_dir / "stack_rank_calibration.py")
    lgcn = load_module("lgcn", code_dir / "train_val_lgcn_ensemble.py")
    post = load_module("post", code_dir / "post95_ablation.py")
    gen = load_module("gen", code_dir / "generate_post95_submission.py")
    extra = load_module("extra", code_dir / "extra_score_sources_ablation.py")
    run_dir = root / "validation_runs" / f"dynamic_seed{split_seed}"

    train_refs, val_pairs = lgcn.make_notebook_style_split(root, split_seed, 0.9)
    pairs = val_pairs[["source", "target"]].to_numpy(np.int64)
    y = val_pairs["label"].to_numpy(np.int8)
    main_scores = np.load(main_score_file).astype(np.float32)

    builder = stack.ExplicitGraphFeatures(root, train_refs)
    handcrafted = builder.transform(pairs)
    core_blocks = [
        stack.add_rank_features(pairs, main_scores),
        handcrafted,
        post.negative_evidence_features(handcrafted, main_scores),
        gen.topk_content_similarity_fast(root, pairs, builder),
    ]
    features = np.column_stack(core_blocks).astype(np.float32)

    # One score-file path per non-blank line of the selection listing.
    listing = run_dir / "post95_submission" / "selected_variant_val_scores.txt"
    variant_paths = []
    for raw_line in listing.read_text().splitlines():
        if raw_line.strip():
            variant_paths.append(Path(raw_line.strip()))
    variant_scores = [np.load(score_path).astype(np.float32) for score_path in variant_paths]
    variant_block = gen.variant_feature_matrix(post, variant_scores)
    features = np.column_stack([features, variant_block]).astype(np.float32)

    content = extra.content_mean_score(root, pairs, builder)
    # NOTE(review): the MF file name hard-codes "s202" while the directory
    # follows ``split_seed`` - confirm this is intended for other seeds.
    mf = np.load(run_dir / "extra_score_sources" / "val_mf_bpr_s202_d256.npy").astype(np.float32)
    content_block, _ = score_to_features(content, "content_mean_cos", pairs)
    mf_block, _ = score_to_features(mf, "mf_bpr", pairs)
    features = np.column_stack([features, content_block, mf_block]).astype(np.float32)
    return train_refs, pairs, y, features
def main() -> None:
    """Run the DeepWalk / Node2Vec ablation and write its results to disk.

    The baseline feature matrix is scored first. The DeepWalk and Node2Vec
    models are then both trained (or loaded), and their cosine and dot-product
    feature blocks are added cumulatively, re-fitting the stacker after each
    addition. Out-of-fold predictions are saved as ``.npy`` files, and the
    per-stage metrics are written to ``node2vec_deepwalk_ablation.csv`` in the
    run's ``node2vec_deepwalk`` directory and printed.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--package-root", type=Path, default=Path(__file__).resolve().parents[1])
    cli.add_argument("--split-seed", type=int, default=202)
    cli.add_argument("--main-val-score-file", type=Path, required=True)
    cli.add_argument("--dim", type=int, default=128)
    cli.add_argument("--walk-length", type=int, default=24)
    cli.add_argument("--num-walks", type=int, default=4)
    cli.add_argument("--window", type=int, default=8)
    cli.add_argument("--workers", type=int, default=8)
    cli.add_argument("--seed", type=int, default=202)
    cli.add_argument("--n-splits", type=int, default=5)
    args = cli.parse_args()

    root = args.package_root
    out_dir = root / "validation_runs" / f"dynamic_seed{args.split_seed}" / "node2vec_deepwalk"
    out_dir.mkdir(parents=True, exist_ok=True)

    train_refs, pairs, y, X_base = build_current_best_features(root, args.split_seed, args.main_val_score_file)
    G = build_graph(root, train_refs)
    print(f"graph nodes={G.number_of_nodes()} edges={G.number_of_edges()}")

    rows = []

    def record(stage: str, predictions: np.ndarray, n_features: int) -> None:
        """Append the best-F1 summary of ``predictions`` to ``rows``."""
        f1, th, auc, p, r = best_f1(y, predictions)
        rows.append({"stage": stage, "f1": f1, "threshold": th, "auc": auc, "precision": p, "recall": r, "n_features": n_features})

    base_oof = fit_lgb_oof(X_base, y, args.seed, args.n_splits)
    record("content_mf_baseline", base_oof, X_base.shape[1])
    np.save(out_dir / "baseline_oof.npy", base_oof)

    # Both embedding models are prepared up front, before any pair scoring.
    deepwalk_model = train_deepwalk(G, out_dir / f"deepwalk_d{args.dim}.model", args.dim, args.walk_length, args.num_walks, args.window, args.seed, args.workers)
    node2vec_model = train_node2vec(G, out_dir / f"node2vec_d{args.dim}_p1_q2.model", args.dim, args.walk_length, args.num_walks, args.window, 1.0, 2.0, args.seed, args.workers)

    embedding_blocks = []
    for name, model in (("deepwalk", deepwalk_model), ("node2vec", node2vec_model)):
        cos, dot = pair_scores(model, pairs, name, root, args.split_seed)
        cos_features, _ = score_to_features(cos, f"{name}_cos", pairs)
        dot_features, _ = score_to_features(dot, f"{name}_dot", pairs)
        embedding_blocks.append(np.column_stack([cos_features, dot_features]).astype(np.float32))
        # Stages are cumulative: every block gathered so far joins the baseline.
        X_cur = np.column_stack([X_base, *embedding_blocks]).astype(np.float32)
        stage_seed = args.seed + len(embedding_blocks) * 17
        oof = fit_lgb_oof(X_cur, y, stage_seed, args.n_splits)
        record(f"+{name}", oof, X_cur.shape[1])
        np.save(out_dir / f"{name}_stack_oof.npy", oof)

    result = pd.DataFrame(rows).sort_values("f1", ascending=False)
    result.to_csv(out_dir / "node2vec_deepwalk_ablation.csv", index=False)
    print(result.to_string(index=False))
# Run the ablation only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()