# cs3319-project2 / code / randomwalk_systematic_ablation.py
# CS3319 Project 2 final deliverable (public F1 = 0.96626) - f28d994
"""Systematic DeepWalk/Node2Vec ablations on top of the current stacker."""
from __future__ import annotations
import argparse
import importlib.util
from dataclasses import dataclass
from pathlib import Path
import lightgbm as lgb
import networkx as nx
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from node2vec import Node2Vec
from sklearn.metrics import precision_recall_curve, roc_auc_score
from sklearn.model_selection import StratifiedKFold
def load_module(name: str, path: Path):
    """Execute the Python source file at ``path`` and return it as a module.

    The module object is created under ``name`` and executed, but it is not
    registered in ``sys.modules``.

    Args:
        name: Module name to assign to the loaded module.
        path: Location of the source file.

    Returns:
        The executed module object.

    Raises:
        ImportError: If no import spec or loader can be derived from ``path``
            (for example, the suffix is not a supported source/bytecode one).
    """
    spec = importlib.util.spec_from_file_location(name, path)
    # Validate before module_from_spec(): a None spec would otherwise surface
    # as an AttributeError, and an ``assert`` would vanish under ``python -O``.
    if spec is None or spec.loader is None:
        raise ImportError(f"cannot load module {name!r} from {path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
def read_txt(path: Path) -> list[list[int]]:
    """Parse a whitespace-separated integer table from a text file.

    Each non-blank line becomes one row of ints. Blank (or whitespace-only)
    lines are skipped so that callers unpacking fixed-width rows do not hit
    empty lists.

    Args:
        path: Text file to read.

    Returns:
        One list of ints per non-blank line, in file order.

    Raises:
        ValueError: If a token cannot be converted to ``int``.
    """
    # The context manager guarantees the handle is closed even if parsing fails.
    with path.open() as handle:
        return [[int(token) for token in line.split()] for line in handle if line.strip()]
def best_f1(y: np.ndarray, s: np.ndarray):
    """Find the F1-maximising operating point of scores ``s`` against labels ``y``.

    Returns:
        A tuple ``(f1, threshold, roc_auc, precision, recall)`` of Python
        floats, all taken at the index where F1 is largest on the
        precision-recall curve.
    """
    precision, recall, thresholds = precision_recall_curve(y, s)
    # The small epsilon keeps the ratio defined where precision + recall == 0.
    f1_curve = 2 * precision * recall / (precision + recall + 1e-12)
    best = int(np.argmax(f1_curve))
    if best < len(thresholds):
        threshold = float(thresholds[best])
    else:
        # The best index has no matching threshold entry; fall back to 0.5.
        threshold = 0.5
    best_f1_value = float(f1_curve[best])
    auc = float(roc_auc_score(y, s))
    return best_f1_value, threshold, auc, float(precision[best]), float(recall[best])
def rank01(x: np.ndarray) -> np.ndarray:
    """Map values to their rank, evenly spaced over [0, 1], as float32.

    The smallest value gets 0 and the largest gets 1. Ties are broken by
    original position because the sort is stable.
    """
    n = len(x)
    scaled = np.empty(n, dtype=np.float32)
    ascending = np.argsort(x, kind="mergesort")
    scaled[ascending] = np.linspace(0, 1, n, dtype=np.float32)
    return scaled
def zscore(x: np.ndarray) -> np.ndarray:
    """Standardise ``x`` to zero mean and (near) unit variance, as float32.

    A small constant is added to the standard deviation so a constant input
    yields zeros instead of dividing by zero.
    """
    centered = x - x.mean()
    scale = x.std() + 1e-8
    return (centered / scale).astype(np.float32)
def fit_lgb_oof(X: np.ndarray, y: np.ndarray, seed: int, n_splits: int) -> np.ndarray:
    """Return out-of-fold positive-class probabilities from a LightGBM classifier.

    Rows are split with a shuffled ``StratifiedKFold`` seeded by ``seed``; a
    fresh classifier is fitted on each training part and used to score the
    held-out part. Fold ``k`` (1-based) uses ``random_state = seed + k``.

    Returns:
        A float32 array of length ``len(y)`` holding each row's held-out score.
    """
    shared_params = dict(
        n_estimators=1200,
        learning_rate=0.025,
        num_leaves=31,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=5.0,
        min_child_samples=80,
        objective="binary",
        n_jobs=4,
        verbose=-1,
    )
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    predictions = np.zeros(len(y), dtype=np.float32)
    fold_no = 0
    for train_idx, valid_idx in splitter.split(X, y):
        fold_no += 1
        model = lgb.LGBMClassifier(**shared_params, random_state=seed + fold_no)
        model.fit(X[train_idx], y[train_idx])
        predictions[valid_idx] = model.predict_proba(X[valid_idx])[:, 1]
    return predictions
@dataclass(frozen=True)
class RWConfig:
    """Immutable description of one random-walk embedding run.

    Raises:
        ValueError: If ``method`` is ``"Node2Vec"`` and ``p`` or ``q`` is
            missing or not strictly positive.
    """

    # Unique run label; used in model, cache and output file names.
    version_name: str
    # Which edge sets go into the graph (interpreted by build_graph).
    graph_type: str
    # Walk generator: "DeepWalk" or "Node2Vec".
    method: str
    # Embedding dimensionality.
    dim: int
    # Maximum number of nodes per walk.
    walk_length: int
    # Number of walks started from every node.
    num_walks: int
    # Skip-gram context window.
    window: int
    # Node2Vec p / q parameters; left as None for DeepWalk.
    p: float | None = None
    q: float | None = None
    # Seed for walk generation and Word2Vec training.
    seed: int = 202

    def __post_init__(self) -> None:
        # Fail at construction time instead of later at ``float(None)`` once
        # the (expensive) graph has already been built.
        if self.method != "Node2Vec":
            return
        for label, value in (("p", self.p), ("q", self.q)):
            if value is None or value <= 0:
                raise ValueError(
                    f"{self.version_name}: Node2Vec requires a positive {label}, got {value!r}"
                )
def small_configs() -> list[RWConfig]:
    """Return the six full-graph configs of the "small" ablation.

    Three DeepWalk variants (base, longer walks, higher dimension) followed
    by three Node2Vec variants that share ``p=1.0`` and differ in ``q``.
    """

    def deepwalk(name: str, dim: int, walk_length: int) -> RWConfig:
        return RWConfig(
            version_name=name,
            graph_type="full",
            method="DeepWalk",
            dim=dim,
            walk_length=walk_length,
            num_walks=10,
            window=10,
        )

    def node2vec(name: str, q: float) -> RWConfig:
        return RWConfig(
            version_name=name,
            graph_type="full",
            method="Node2Vec",
            dim=128,
            walk_length=40,
            num_walks=10,
            window=10,
            p=1.0,
            q=q,
        )

    return [
        deepwalk("dw_base_d128_l40_w10_win10", 128, 40),
        deepwalk("dw_long_d128_l80_w10_win10", 128, 80),
        deepwalk("dw_highdim_d256_l40_w10_win10", 256, 40),
        node2vec("n2v_bfs_d128_l40_w10_win10_p1_q2", 2.0),
        node2vec("n2v_dfs_d128_l40_w10_win10_p1_q0.5", 0.5),
        node2vec("n2v_bal_d128_l40_w10_win10_p1_q1", 1.0),
    ]
def graph_configs() -> list[RWConfig]:
    """Return DeepWalk configs that differ only in the graph they walk on.

    All four share dim=128, walk_length=40, num_walks=10 and window=10.
    """
    variants = (
        ("dw_graph_ap_only", "ap_only"),
        ("dw_graph_ap_aa", "ap_aa"),
        ("dw_graph_ap_pp", "ap_pp"),
        ("dw_graph_pp_author_mean", "pp_only_author_mean"),
    )
    return [
        RWConfig(
            version_name=label,
            graph_type=graph_type,
            method="DeepWalk",
            dim=128,
            walk_length=40,
            num_walks=10,
            window=10,
        )
        for label, graph_type in variants
    ]
def extra_configs() -> list[RWConfig]:
    """Return additional full-graph configs: seed, size and Node2Vec ``p`` variants.

    Two DeepWalk runs with alternative seeds, two with different
    dimension / walk length, and two Node2Vec runs with ``q=1.0`` and
    ``p`` set to 0.5 and 2.0.
    """
    common = {"graph_type": "full", "num_walks": 10, "window": 10}

    def deepwalk(name: str, dim: int = 128, walk_length: int = 40, **overrides) -> RWConfig:
        return RWConfig(
            version_name=name,
            method="DeepWalk",
            dim=dim,
            walk_length=walk_length,
            **common,
            **overrides,
        )

    def node2vec(name: str, p: float) -> RWConfig:
        return RWConfig(
            version_name=name,
            method="Node2Vec",
            dim=128,
            walk_length=40,
            p=p,
            q=1.0,
            **common,
        )

    return [
        deepwalk("dw_seed42_d128_l40_w10_win10", seed=42),
        deepwalk("dw_seed3407_d128_l40_w10_win10", seed=3407),
        deepwalk("dw_d64_l40_w10_win10", dim=64),
        deepwalk("dw_d256_l80_w10_win10", dim=256, walk_length=80),
        node2vec("n2v_p0.5_q1_d128_l40_w10_win10", 0.5),
        node2vec("n2v_p2_q1_d128_l40_w10_win10", 2.0),
    ]
def build_graph(root: Path, train_refs: pd.DataFrame, graph_type: str) -> nx.Graph:
    """Build the undirected graph that random walks are sampled from.

    Nodes are named ``a<id>`` (authors, ids 0..6610) and ``p<id>`` (papers,
    ids 0..79936). ``graph_type`` selects which edge sets are added:

    * author-paper edges from ``train_refs`` (weight 3.0):
      ``full``, ``ap_only``, ``ap_aa``, ``ap_pp``
    * author-author edges from ``author_file_ann.txt`` (weight 1.0):
      ``full``, ``ap_aa``
    * paper-paper edges from ``paper_file_ann.txt`` (weight 1.0):
      ``full``, ``ap_pp``, ``pp_only_author_mean``

    ``pp_only_author_mean`` creates paper nodes only.

    Args:
        root: Package root; edge files are read from ``root / "data_and_docs"``.
        train_refs: Frame with ``source`` (author id) and ``target`` (paper id)
            columns.
        graph_type: One of the names listed above.

    Raises:
        ValueError: If ``graph_type`` is not a known name. Without this check
            a typo would silently yield a graph with no edges.
    """
    with_author_paper = {"full", "ap_only", "ap_aa", "ap_pp"}
    with_author_author = {"full", "ap_aa"}
    with_paper_paper = {"full", "ap_pp", "pp_only_author_mean"}
    known_types = with_author_paper | with_paper_paper
    if graph_type not in known_types:
        raise ValueError(f"unknown graph_type {graph_type!r}; expected one of {sorted(known_types)}")

    data_dir = root / "data_and_docs"
    G = nx.Graph()
    # Authors first, then papers: node insertion order is kept stable because
    # downstream walk generation iterates over G.nodes().
    if graph_type != "pp_only_author_mean":
        G.add_nodes_from(f"a{a}" for a in range(6611))
    G.add_nodes_from(f"p{p}" for p in range(79937))

    if graph_type in with_author_paper:
        G.add_weighted_edges_from(
            (f"a{int(a)}", f"p{int(p)}", 3.0)
            for a, p in train_refs[["source", "target"]].to_numpy(np.int64)
        )
    if graph_type in with_author_author:
        G.add_weighted_edges_from(
            (f"a{a}", f"a{b}", 1.0) for a, b in read_txt(data_dir / "author_file_ann.txt")
        )
    if graph_type in with_paper_paper:
        G.add_weighted_edges_from(
            (f"p{s}", f"p{t}", 1.0) for s, t in read_txt(data_dir / "paper_file_ann.txt")
        )
    return G
def deepwalk_walks(G: nx.Graph, walk_length: int, num_walks: int, seed: int) -> list[list[str]]:
    """Sample uniform random walks from every node of ``G``.

    For each of ``num_walks`` passes the node list is shuffled and one walk is
    started from every node. Each step moves to a neighbour chosen uniformly
    at random (edge weights are not consulted). A walk stops early when the
    current node has no neighbours.

    Returns:
        ``num_walks * number_of_nodes`` walks, each with between 1 and
        ``walk_length`` nodes.
    """
    rng = np.random.default_rng(seed)
    all_nodes = np.array(list(G.nodes()), dtype=object)
    # Materialise adjacency once so the inner loop is plain list indexing.
    adjacency = {node: list(G.neighbors(node)) for node in G.nodes()}
    max_steps = walk_length - 1
    corpus: list[list[str]] = []
    for _ in range(num_walks):
        shuffled = all_nodes.copy()
        rng.shuffle(shuffled)
        for origin in shuffled:
            path = [origin]
            while len(path) <= max_steps:
                options = adjacency[path[-1]]
                if not options:
                    break
                path.append(options[int(rng.integers(0, len(options)))])
            corpus.append(path)
    return corpus
def train_model(G: nx.Graph, cfg: RWConfig, out_dir: Path, workers: int) -> Word2Vec:
    """Train (or load from disk) the embedding model described by ``cfg``.

    If ``out_dir / f"{cfg.version_name}.model"`` already exists it is loaded
    and returned without retraining. Otherwise walks are generated with the
    configured method, a skip-gram Word2Vec model is fitted, saved to that
    path and returned.

    Args:
        G: Graph to walk on.
        cfg: Run configuration; ``cfg.method`` must be ``"DeepWalk"`` or
            ``"Node2Vec"``.
        out_dir: Directory holding cached ``.model`` files.
        workers: Worker count passed to the walk generator / Word2Vec.

    Raises:
        ValueError: If ``cfg.method`` is unknown, or if it is ``"Node2Vec"``
            and ``cfg.p`` / ``cfg.q`` is missing.
    """
    model_path = out_dir / f"{cfg.version_name}.model"
    if model_path.exists():
        return Word2Vec.load(str(model_path))

    if cfg.method == "DeepWalk":
        walks = deepwalk_walks(G, cfg.walk_length, cfg.num_walks, cfg.seed)
        model = Word2Vec(
            sentences=walks,
            vector_size=cfg.dim,
            window=cfg.window,
            min_count=0,
            sg=1,
            negative=5,
            epochs=3,
            workers=workers,
            seed=cfg.seed,
        )
    elif cfg.method == "Node2Vec":
        if cfg.p is None or cfg.q is None:
            raise ValueError(f"{cfg.version_name}: Node2Vec requires both p and q")
        n2v = Node2Vec(
            G,
            dimensions=cfg.dim,
            walk_length=cfg.walk_length,
            num_walks=cfg.num_walks,
            p=float(cfg.p),
            q=float(cfg.q),
            workers=workers,
            seed=cfg.seed,
            quiet=False,
        )
        model = n2v.fit(window=cfg.window, min_count=0, batch_words=4096, seed=cfg.seed, epochs=3)
    else:
        # Previously any unrecognised method fell through to the Node2Vec branch.
        raise ValueError(f"{cfg.version_name}: unknown method {cfg.method!r}")

    # Make sure a long training run is not lost to a missing output directory.
    out_dir.mkdir(parents=True, exist_ok=True)
    model.save(str(model_path))
    return model
def embedding_arrays(model: Word2Vec, train_refs: pd.DataFrame | None = None) -> tuple[np.ndarray, np.ndarray]:
    """Copy node embeddings out of ``model`` into dense author / paper tables.

    Row ``i`` of the author table holds the vector stored under ``a<i>`` and
    row ``j`` of the paper table the vector under ``p<j>``; ids missing from
    the model keep an all-zero row.

    If ``train_refs`` is given and no author row is non-zero, each author with
    at least one (source, target) pair in ``train_refs`` is assigned the mean
    of the paper rows it is paired with.

    Returns:
        ``(avec, pvec)`` as float32 arrays of shape ``(6611, dim)`` and
        ``(79937, dim)``.
    """
    n_authors, n_papers = 6611, 79937
    vectors = model.wv
    width = model.vector_size
    avec = np.zeros((n_authors, width), dtype=np.float32)
    pvec = np.zeros((n_papers, width), dtype=np.float32)

    # Papers are filled before authors, mirroring the lookup order of the keys.
    for prefix, table in (("p", pvec), ("a", avec)):
        for node_id in range(len(table)):
            token = f"{prefix}{node_id}"
            if token in vectors:
                table[node_id] = vectors[token]

    if train_refs is None:
        return avec, pvec

    has_author_embedding = np.any(np.abs(avec).sum(axis=1) > 0)
    if has_author_embedding:
        return avec, pvec

    papers_of: list[list[int]] = [[] for _ in range(n_authors)]
    for author_id, paper_id in train_refs[["source", "target"]].to_numpy(np.int64):
        papers_of[int(author_id)].append(int(paper_id))
    for author_id, history in enumerate(papers_of):
        if not history:
            continue
        avec[author_id] = pvec[np.asarray(history, dtype=np.int64)].mean(axis=0)
    return avec, pvec
def pair_feature_block(
    model: Word2Vec,
    pairs: np.ndarray,
    cfg: RWConfig,
    root: Path,
    split_seed: int,
    train_refs: pd.DataFrame,
) -> tuple[np.ndarray, list[str]]:
    """Compute (or load cached) author-paper similarity features for ``pairs``.

    Eleven float32 columns are produced per pair, in this order: dot product,
    cosine, mean Hadamard product, mean absolute difference, L2 distance,
    global rank of dot / cosine scaled to [0, 1], within-author rank of
    dot / cosine (0..n-1), and within-author rank of dot / cosine scaled to
    [0, 1].

    Results are cached as ``.npz`` under
    ``root/validation_runs/dynamic_seed<split_seed>/randomwalk_systematic/pair_features``.
    The cache key combines the config name, the pair count and the column
    sums of ``pairs``.

    Args:
        model: Trained embedding model.
        pairs: Integer array whose column 0 is the author id and column 1 the
            paper id.
        cfg: Run configuration; supplies the feature-name prefix and decides
            whether author vectors may fall back to paper means.
        root: Package root.
        split_seed: Seed identifying the validation split directory.
        train_refs: Training pairs, only used for the ``pp_only_author_mean``
            graph type.

    Returns:
        ``(X, names)`` with ``X`` of shape ``(len(pairs), 11)`` and ``names``
        the matching column names prefixed with ``cfg.version_name``.
    """
    cache_dir = root / "validation_runs" / f"dynamic_seed{split_seed}" / "randomwalk_systematic" / "pair_features"
    cache_dir.mkdir(parents=True, exist_ok=True)
    key = f"{cfg.version_name}_{len(pairs)}_{int(pairs[:,0].sum())}_{int(pairs[:,1].sum())}.npz"
    path = cache_dir / key
    names = [
        "dot",
        "cos",
        "hadamard_mean",
        "absdiff_mean",
        "l2_distance",
        "dot_global_rank",
        "cos_global_rank",
        "dot_author_rank",
        "cos_author_rank",
        "dot_author_pct",
        "cos_author_pct",
    ]
    names = [f"{cfg.version_name}_{n}" for n in names]
    if path.exists():
        # np.load on an .npz returns an archive that holds the file open;
        # close it deterministically once the array has been copied out.
        with np.load(path) as cached:
            return cached["X"].astype(np.float32), names

    avec, pvec = embedding_arrays(model, train_refs if cfg.graph_type == "pp_only_author_mean" else None)
    A = avec[pairs[:, 0]]
    P = pvec[pairs[:, 1]]
    dot = np.sum(A * P, axis=1).astype(np.float32)
    cos = (dot / ((np.linalg.norm(A, axis=1) + 1e-8) * (np.linalg.norm(P, axis=1) + 1e-8))).astype(np.float32)
    had = np.mean(A * P, axis=1).astype(np.float32)
    absdiff = np.mean(np.abs(A - P), axis=1).astype(np.float32)
    l2 = np.sqrt(np.sum((A - P) ** 2, axis=1)).astype(np.float32)

    dot_ar = np.zeros(len(pairs), dtype=np.float32)
    cos_ar = np.zeros(len(pairs), dtype=np.float32)
    dot_pct = np.zeros(len(pairs), dtype=np.float32)
    cos_pct = np.zeros(len(pairs), dtype=np.float32)
    df = pd.DataFrame({"idx": np.arange(len(pairs)), "author": pairs[:, 0], "dot": dot, "cos": cos})
    for _, g in df.groupby("author", sort=False):
        idx = g["idx"].to_numpy()
        n = len(idx)
        # A lone candidate gets percentile 1.0 rather than linspace's 0.0.
        vals = np.linspace(0, 1, n, dtype=np.float32) if n > 1 else np.array([1.0], dtype=np.float32)
        # Stable sorts keep tied scores in their original order.
        od = np.argsort(g["dot"].to_numpy(), kind="mergesort")
        oc = np.argsort(g["cos"].to_numpy(), kind="mergesort")
        dot_ar[idx[od]] = np.arange(n, dtype=np.float32)
        cos_ar[idx[oc]] = np.arange(n, dtype=np.float32)
        dot_pct[idx[od]] = vals
        cos_pct[idx[oc]] = vals

    X = np.column_stack(
        [dot, cos, had, absdiff, l2, rank01(dot), rank01(cos), dot_ar, cos_ar, dot_pct, cos_pct]
    ).astype(np.float32)
    np.savez_compressed(path, X=X)
    return X, names
def build_base_features(root: Path, split_seed: int, main_score_file: Path):
    """Assemble the baseline stacker feature matrix for the validation split.

    Helper modules are loaded from ``root / "code"``, the train/validation
    split is created, and several feature groups are concatenated column-wise
    in a fixed order: rank features of the main score, explicit graph
    features, negative-evidence features, top-k content similarity, selected
    variant score features, content-mean score features and MF-BPR score
    features.

    Returns:
        ``(train_refs, pairs, y, X)`` where ``pairs`` is the int64
        (source, target) array of validation pairs, ``y`` their int8 labels
        and ``X`` the float32 feature matrix.
    """
    code_dir = root / "code"
    run_dir = root / "validation_runs" / f"dynamic_seed{split_seed}"

    stack = load_module("stack", code_dir / "stack_rank_calibration.py")
    lgcn = load_module("lgcn", code_dir / "train_val_lgcn_ensemble.py")
    post = load_module("post", code_dir / "post95_ablation.py")
    gen = load_module("gen", code_dir / "generate_post95_submission.py")
    extra = load_module("extra", code_dir / "extra_score_sources_ablation.py")

    train_refs, val_pairs = lgcn.make_notebook_style_split(root, split_seed, 0.9)
    pairs = val_pairs[["source", "target"]].to_numpy(np.int64)
    y = val_pairs["label"].to_numpy(np.int8)
    main = np.load(main_score_file).astype(np.float32)

    # Stage 1: features derived from the main score and the explicit graph.
    builder = stack.ExplicitGraphFeatures(root, train_refs)
    Xh = builder.transform(pairs)
    stage_one = [
        stack.add_rank_features(pairs, main),
        Xh,
        post.negative_evidence_features(Xh, main),
        gen.topk_content_similarity_fast(root, pairs, builder),
    ]
    X = np.column_stack(stage_one).astype(np.float32)

    # Stage 2: features from the previously selected variant score files.
    listing = (run_dir / "post95_submission" / "selected_variant_val_scores.txt").read_text()
    selected = [Path(entry) for entry in map(str.strip, listing.splitlines()) if entry]
    variant_scores = [np.load(score_path).astype(np.float32) for score_path in selected]
    X = np.column_stack([X, gen.variant_feature_matrix(post, variant_scores)]).astype(np.float32)

    # Stage 3: extra score sources (content mean cosine and MF-BPR).
    content = extra.content_mean_score(root, pairs, builder)
    mf = np.load(run_dir / "extra_score_sources" / "val_mf_bpr_s202_d256.npy").astype(np.float32)
    Xc, _ = extra.score_to_features(content, "content_mean_cos", pairs)
    Xm, _ = extra.score_to_features(mf, "mf_bpr", pairs)
    X = np.column_stack([X, Xc, Xm]).astype(np.float32)
    return train_refs, pairs, y, X
def train_full_predict(X: np.ndarray, y: np.ndarray, X_test: np.ndarray, seed: int):
    """Fit one LightGBM classifier on all of ``X`` / ``y`` and score ``X_test``.

    Returns:
        ``(scores, clf)`` where ``scores`` are the float32 positive-class
        probabilities for ``X_test`` and ``clf`` is the fitted classifier.
    """
    hyper_params = {
        "n_estimators": 1200,
        "learning_rate": 0.025,
        "num_leaves": 31,
        "subsample": 0.9,
        "colsample_bytree": 0.9,
        "reg_lambda": 5.0,
        "min_child_samples": 80,
        "objective": "binary",
        "verbose": -1,
        "random_state": seed,
    }
    clf = lgb.LGBMClassifier(**hyper_params)
    clf.fit(X, y)
    positive_proba = clf.predict_proba(X_test)[:, 1]
    return positive_proba.astype(np.float32), clf
def _aggregate_rw_features(blocks: list[np.ndarray]) -> np.ndarray:
    """Summarise dot, cosine and per-author cosine percentile across blocks.

    Reads columns 0 (dot), 1 (cosine) and 10 (cosine author percentile) of
    every block and returns ten float32 columns per row: cosine
    mean/std/max/min, dot mean/std, percentile mean/std/max, and the number
    of blocks whose percentile is at least 0.5.
    """
    dot_stack = np.vstack([b[:, 0] for b in blocks])
    cos_stack = np.vstack([b[:, 1] for b in blocks])
    ar_stack = np.vstack([b[:, 10] for b in blocks])  # cosine author percentile
    agree = (ar_stack >= 0.5).sum(axis=0).astype(np.float32)
    return np.column_stack(
        [
            cos_stack.mean(axis=0),
            cos_stack.std(axis=0),
            cos_stack.max(axis=0),
            cos_stack.min(axis=0),
            dot_stack.mean(axis=0),
            dot_stack.std(axis=0),
            ar_stack.mean(axis=0),
            ar_stack.std(axis=0),
            ar_stack.max(axis=0),
            agree,
        ]
    ).astype(np.float32)


def main() -> None:
    """Run the random-walk ablation selected by ``--mode`` and write result tables.

    For every config: build the graph, train or load the embedding model,
    compute pair features, append them to the base features and score the
    combination with out-of-fold LightGBM. Afterwards the (up to) five best
    configs by validation F1 are combined, together with cross-model
    aggregate features, into one ensemble row.

    Outputs under ``validation_runs/dynamic_seed<split_seed>/randomwalk_systematic``:
    per-config ``*_oof.npy``, ``<mode>_ensemble_oof.npy`` and
    ``<mode>_ablation_table.csv``.
    """
    config_factories = {
        "small": small_configs,
        "graph": graph_configs,
        "extra": extra_configs,
    }
    parser = argparse.ArgumentParser()
    parser.add_argument("--package-root", type=Path, default=Path(__file__).resolve().parents[1])
    parser.add_argument("--split-seed", type=int, default=202)
    parser.add_argument("--main-val-score-file", type=Path, required=True)
    parser.add_argument("--workers", type=int, default=8)
    parser.add_argument("--seed", type=int, default=202)
    parser.add_argument("--n-splits", type=int, default=5)
    parser.add_argument("--mode", choices=list(config_factories), default="small")
    args = parser.parse_args()

    root = args.package_root
    out_dir = root / "validation_runs" / f"dynamic_seed{args.split_seed}" / "randomwalk_systematic"
    model_dir = out_dir / "models"
    out_dir.mkdir(parents=True, exist_ok=True)
    model_dir.mkdir(parents=True, exist_ok=True)

    train_refs, pairs, y, X_base = build_base_features(root, args.split_seed, args.main_val_score_file)
    configs = config_factories[args.mode]()

    rows = []
    feature_blocks: list[np.ndarray] = []
    feature_names: list[list[str]] = []
    for cfg in configs:
        print(f"\n=== {cfg.version_name} ===")
        G = build_graph(root, train_refs, cfg.graph_type)
        print(f"graph_type={cfg.graph_type} nodes={G.number_of_nodes()} edges={G.number_of_edges()}")
        model = train_model(G, cfg, model_dir, args.workers)
        block, names = pair_feature_block(model, pairs, cfg, root, args.split_seed, train_refs)
        X = np.column_stack([X_base, block]).astype(np.float32)
        # NOTE(review): the CV seed changes with the config position, so fold
        # assignments differ between configs - confirm this is intended.
        oof = fit_lgb_oof(X, y, args.seed + len(rows) * 13, args.n_splits)
        f1, th, auc, p, r = best_f1(y, oof)
        np.save(out_dir / f"{cfg.version_name}_oof.npy", oof)
        # Full test generation is delegated to the ensemble script for selected versions;
        # single-version submission paths are recorded as intended paths.
        sub_path = out_dir / "single_submissions" / f"submission_{cfg.version_name}_th0.480000.csv"
        rows.append(
            {
                "version_name": cfg.version_name,
                "graph_type": cfg.graph_type,
                "method": cfg.method,
                "dim": cfg.dim,
                "walk_length": cfg.walk_length,
                "num_walks": cfg.num_walks,
                "window": cfg.window,
                "p": cfg.p,
                "q": cfg.q,
                "validation_F1": f1,
                "threshold": th,
                "auc": auc,
                "precision": p,
                "recall": r,
                "predicted_positive_ratio": np.nan,
                "public_submission_path": str(sub_path),
                "changed_predictions_vs_current_best": np.nan,
                "rw_feature_importance_best_rank": np.nan,
            }
        )
        feature_blocks.append(block)
        feature_names.append(names)

    table_path = out_dir / f"{args.mode}_ablation_table.csv"
    result = pd.DataFrame(rows).sort_values("validation_F1", ascending=False)
    result.to_csv(table_path, index=False)
    print(result.to_string(index=False))

    # Ensemble top 5 by validation F1 using aggregate random-walk features.
    # sort_values keeps the original row labels, so the index still addresses
    # feature_blocks in config order.
    top_idx = result.index[: min(5, len(result))].to_list()
    blocks = [feature_blocks[i] for i in top_idx]
    agg = _aggregate_rw_features(blocks)
    X_ens = np.column_stack([X_base, *blocks, agg]).astype(np.float32)
    oof = fit_lgb_oof(X_ens, y, args.seed + 999, args.n_splits)
    f1, th, auc, p, r = best_f1(y, oof)
    np.save(out_dir / f"{args.mode}_ensemble_oof.npy", oof)
    ens_row = {
        "version_name": f"{args.mode}_top{len(blocks)}_rw_ensemble",
        "graph_type": "mixed",
        "method": "RWEnsemble",
        "dim": np.nan,
        "walk_length": np.nan,
        "num_walks": np.nan,
        "window": np.nan,
        "p": np.nan,
        "q": np.nan,
        "validation_F1": f1,
        "threshold": th,
        "auc": auc,
        "precision": p,
        "recall": r,
        "predicted_positive_ratio": np.nan,
        "public_submission_path": str(root / "validation_runs" / f"dynamic_seed{args.split_seed}" / "randomwalk_ensemble_submission"),
        "changed_predictions_vs_current_best": np.nan,
        "rw_feature_importance_best_rank": np.nan,
    }
    result = pd.concat([result, pd.DataFrame([ens_row])], ignore_index=True).sort_values("validation_F1", ascending=False)
    result.to_csv(table_path, index=False)
    print("\nFinal table:")
    print(result.to_string(index=False))


if __name__ == "__main__":
    main()