cs3319-project2 / code /rich_randomwalk_stack.py
NLP-beginner's picture
CS3319 Project 2 final deliverable (public F1 = 0.96626)
f28d994
Raw
History Blame Contribute Delete
9.15 kB
"""Add rich content features to the systematic random-walk stack."""
from __future__ import annotations
import argparse
import importlib.util
import sys
from pathlib import Path
import lightgbm as lgb
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.metrics import precision_recall_curve, roc_auc_score
from sklearn.model_selection import StratifiedKFold
def load_module(name: str, path: Path):
    """Import the Python source file at *path* and register it as *name*.

    The new module object is stored in ``sys.modules[name]`` before its
    body is executed, and the executed module is returned.

    Args:
        name: Key under which the module is registered in ``sys.modules``.
        path: Location of the source file to execute.

    Returns:
        The loaded module object.

    Raises:
        ImportError: If no import spec or loader can be built for *path*
            (for example, the file suffix is not a recognised module type).
        Exception: Whatever the module body raises while executing; in that
            case the ``sys.modules`` entry added here is removed again.
    """
    spec = importlib.util.spec_from_file_location(name, path)
    # Validate explicitly instead of with ``assert`` so the check survives
    # ``python -O`` and a missing spec is reported as an import failure.
    if spec is None or spec.loader is None:
        raise ImportError(
            f"cannot load module {name!r} from {path}", name=name, path=str(path)
        )
    module = importlib.util.module_from_spec(spec)
    sys.modules[name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module registered under ``name``.
        sys.modules.pop(name, None)
        raise
    return module
def best_f1(y: np.ndarray, s: np.ndarray):
    """Find the F1-maximising operating point on the precision-recall curve.

    Args:
        y: Ground-truth binary labels.
        s: Scores for the same rows as ``y``.

    Returns:
        A tuple ``(f1, threshold, auc, precision, recall)`` of Python floats
        taken at the curve point with the highest F1. ``threshold`` falls
        back to ``0.5`` when that point has no entry in the threshold array.
    """
    precision, recall, thresholds = precision_recall_curve(y, s)
    # The small epsilon keeps the ratio finite where precision + recall == 0.
    f_scores = 2 * precision * recall / (precision + recall + 1e-12)
    best_idx = int(np.argmax(f_scores))

    if best_idx < len(thresholds):
        threshold = float(thresholds[best_idx])
    else:
        threshold = 0.5

    auc = float(roc_auc_score(y, s))
    return (
        float(f_scores[best_idx]),
        threshold,
        auc,
        float(precision[best_idx]),
        float(recall[best_idx]),
    )
def fit_lgb_oof(X: np.ndarray, y: np.ndarray, seed: int, n_splits: int, leaves: int = 31) -> np.ndarray:
    """Produce out-of-fold positive-class probabilities with LightGBM.

    Rows are split with a shuffled ``StratifiedKFold`` seeded by ``seed``.
    One classifier is trained per fold on the training part and used to
    score the held-out part.

    Args:
        X: Feature matrix, indexed by row.
        y: Binary labels aligned with the rows of ``X``.
        seed: Seed for the splitter; fold ``k`` (1-based) trains with
            ``random_state=seed + k``.
        n_splits: Number of folds.
        leaves: ``num_leaves`` passed to each classifier.

    Returns:
        A float32 array of length ``len(y)`` holding, for every row, the
        probability predicted by the model that did not train on it.
    """
    fixed_params = {
        "n_estimators": 1200,
        "learning_rate": 0.025,
        "num_leaves": leaves,
        "subsample": 0.9,
        "colsample_bytree": 0.9,
        "reg_lambda": 7.0,
        "min_child_samples": 80,
        "objective": "binary",
        "n_jobs": 8,
        "verbose": -1,
    }
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    held_out = np.zeros(len(y), dtype=np.float32)

    for fold_no, (train_idx, valid_idx) in enumerate(splitter.split(X, y), start=1):
        model = lgb.LGBMClassifier(random_state=seed + fold_no, **fixed_params)
        model.fit(X[train_idx], y[train_idx])
        fold_proba = model.predict_proba(X[valid_idx])
        held_out[valid_idx] = fold_proba[:, 1].astype(np.float32)

    return held_out
def fit_full_predict(X: np.ndarray, y: np.ndarray, Xt: np.ndarray, seed: int, leaves: int = 31) -> np.ndarray:
    """Train one LightGBM classifier on all of ``X`` and score ``Xt``.

    Args:
        X: Training feature matrix.
        y: Binary training labels aligned with ``X``.
        Xt: Feature matrix to score.
        seed: ``random_state`` of the classifier.
        leaves: ``num_leaves`` of the classifier.

    Returns:
        Positive-class probabilities for the rows of ``Xt`` as float32.
    """
    params = {
        "n_estimators": 1400,
        "learning_rate": 0.022,
        "num_leaves": leaves,
        "subsample": 0.9,
        "colsample_bytree": 0.9,
        "reg_lambda": 7.0,
        "min_child_samples": 80,
        "objective": "binary",
        "n_jobs": 8,
        "verbose": -1,
        "random_state": seed,
    }
    model = lgb.LGBMClassifier(**params)
    model.fit(X, y)
    proba = model.predict_proba(Xt)
    return proba[:, 1].astype(np.float32)
def write_sub(path: Path, score: np.ndarray, known: np.ndarray, anchor: np.ndarray, ratio: float) -> tuple[float, int]:
    """Write a binary submission CSV that labels the top-scoring rows positive.

    The ``round(len(score) * ratio)`` highest-scoring rows are set to 1
    (ties resolved by a stable sort), then every row selected by ``known``
    is forced to 1. The result is written with columns ``Index`` and
    ``Predicted``.

    Args:
        path: Destination CSV file.
        score: One score per row; higher means more likely positive.
        known: Mask or index array of rows that are always predicted 1.
        anchor: Reference predictions to compare against, aligned with
            ``score``.
        ratio: Fraction of rows to label positive by score. The resulting
            count is clamped to ``[0, len(score)]``.

    Returns:
        ``(positive_fraction, changed)`` where ``positive_fraction`` is the
        mean of the written predictions and ``changed`` is the number of
        rows whose prediction differs from ``anchor``.
    """
    n_rows = len(score)
    pred = np.zeros(n_rows, dtype=np.int8)

    # Clamp the count: with a count of 0 the slice ``order[-0:]`` would be the
    # whole array and mark every row positive instead of none.
    top_k = min(max(int(round(n_rows * ratio)), 0), n_rows)
    if top_k > 0:
        order = np.argsort(score, kind="mergesort")
        pred[order[-top_k:]] = 1

    pred[known] = 1
    frame = pd.DataFrame({"Index": np.arange(n_rows, dtype=np.int64), "Predicted": pred})
    frame.to_csv(path, index=False)
    return float(pred.mean()), int((pred != anchor).sum())
def _random_walk_blocks(rw, cfgs, versions, model_dir, pairs, root, split_seed, refs):
    """Build one pair-feature block per saved random-walk embedding model.

    For each name in ``versions`` the matching ``<version>.model`` file in
    ``model_dir`` is loaded with ``Word2Vec.load`` and handed to
    ``rw.pair_feature_block`` together with its config from ``cfgs``.
    Only the first element of each returned pair is kept.
    """
    blocks = []
    for version in versions:
        cfg = cfgs[version]
        model = Word2Vec.load(str(model_dir / f"{version}.model"))
        block, _ = rw.pair_feature_block(model, pairs, cfg, root, split_seed, refs)
        blocks.append(block)
    return blocks


def main() -> None:
    """Validate three LightGBM stacks and write submissions for the best one.

    Steps, in order:

    1. Parse CLI arguments and load the sibling project scripts as modules.
    2. Assemble the validation feature matrix from base features, rich
       content features, and one feature block per random-walk model plus
       their aggregate.
    3. Compute out-of-fold predictions for ``num_leaves`` in 31, 15, 63,
       saving ``<stage>_oof.npy`` and ``validation_summary.csv``.
    4. Assemble the test feature matrix, refit the best stage on all
       validation rows, and save ``<stage>_test_pred.npy``.
    5. Write one submission CSV per positive ratio plus
       ``submission_summary.csv``.

    Raises:
        ValueError: If the test feature matrix does not have the same number
            of columns as the training feature matrix.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--package-root", type=Path, default=Path(__file__).resolve().parents[1])
    ap.add_argument("--split-seed", type=int, default=202)
    ap.add_argument("--main-val-score-file", type=Path, default=None)
    ap.add_argument("--seed", type=int, default=202)
    ap.add_argument("--n-splits", type=int, default=5)
    args = ap.parse_args()

    root = args.package_root.resolve()
    main_val = args.main_val_score_file or root / "validation_runs/dynamic_seed202/dyn202_l2d512_bpr_bigbatch_more/scores/val_vanilla_ensemble_mean.npy"

    # Sibling scripts are loaded by file path rather than imported as a package.
    rw = load_module("rw", root / "code/randomwalk_systematic_ablation.py")
    stack = load_module("stack", root / "code/stack_rank_calibration.py")
    gen = load_module("gen", root / "code/generate_post95_submission.py")
    rich = load_module("rich", root / "code/content_rich_ablation.py")
    ens = load_module("ens", root / "code/generate_randomwalk_ensemble_submission.py")

    out = root / "validation_runs" / f"dynamic_seed{args.split_seed}" / "rich_randomwalk_stack"
    sub_dir = out / "submissions"
    out.mkdir(parents=True, exist_ok=True)
    sub_dir.mkdir(parents=True, exist_ok=True)

    versions = [
        "dw_base_d128_l40_w10_win10",
        "dw_long_d128_l80_w10_win10",
        "dw_highdim_d256_l40_w10_win10",
        "dw_d256_l80_w10_win10",
        "dw_seed3407_d128_l40_w10_win10",
        "dw_graph_ap_pp",
        "n2v_p2_q1_d128_l40_w10_win10",
    ]
    cfgs = {c.version_name: c for c in rw.small_configs() + rw.graph_configs() + rw.extra_configs()}
    sys_dir = root / "validation_runs" / f"dynamic_seed{args.split_seed}" / "randomwalk_systematic"
    model_dir = sys_dir / "models"

    # ---- Validation features -------------------------------------------
    train_refs, pairs, y, X_base = rw.build_base_features(root, args.split_seed, main_val)
    builder = stack.ExplicitGraphFeatures(root, train_refs)
    X_rich = rich.content_rich_features(root, pairs, builder)
    blocks = _random_walk_blocks(rw, cfgs, versions, model_dir, pairs, root, args.split_seed, train_refs)
    X = np.column_stack([X_base, X_rich, *blocks, ens.aggregate(blocks)]).astype(np.float32)
    print("X", X.shape)

    # ---- Out-of-fold validation for each tree size ---------------------
    rows = []
    oof_by_stage = {}
    for name, leaves in [("rich_rw7_lgb31", 31), ("rich_rw7_lgb15", 15), ("rich_rw7_lgb63", 63)]:
        oof = fit_lgb_oof(X, y, args.seed + leaves, args.n_splits, leaves=leaves)
        np.save(out / f"{name}_oof.npy", oof)
        oof_by_stage[name] = oof  # kept so the best one need not be re-read from disk
        f1, th, auc, p, r = best_f1(y, oof)
        rows.append({"stage": name, "validation_f1": f1, "threshold": th, "auc": auc, "precision": p, "recall": r, "n_features": X.shape[1]})
        print(rows[-1])
    pd.DataFrame(rows).sort_values("validation_f1", ascending=False).to_csv(out / "validation_summary.csv", index=False)

    # Generate test submissions for the best validation model.
    best = max(rows, key=lambda r: r["validation_f1"])
    # The leaf count is recovered from the stage name suffix ("...lgb31" -> 31).
    best_leaves = int(best["stage"].split("lgb")[-1])

    # ---- Test features --------------------------------------------------
    # NOTE(review): every test-side input below is read from a
    # "dynamic_seed202" path regardless of --split-seed — confirm intended.
    # NOTE(review): the column order here presumably mirrors what
    # rw.build_base_features produces for X_base — verify against that module.
    test_pairs = np.array(gen.read_txt(root / "data_and_docs/bipartite_test_ann.txt"), dtype=np.int64)
    main_test = np.load(root / "validation_runs/dynamic_seed202/post95_test_scores/dyn202_l2d512_bpr_bigbatch_more/scores/test_vanilla_ensemble_mean.npy").astype(np.float32)
    full_refs = pd.DataFrame(gen.read_txt(root / "data_and_docs/bipartite_train_ann.txt"), columns=["source", "target"])
    test_builder = stack.ExplicitGraphFeatures(root, full_refs)
    Xht = test_builder.transform(test_pairs)
    post = load_module("post", root / "code/post95_ablation.py")
    extra = load_module("extra", root / "code/extra_score_sources_ablation.py")
    Xt = np.column_stack(
        [
            stack.add_rank_features(test_pairs, main_test),
            Xht,
            post.negative_evidence_features(Xht, main_test),
            gen.topk_content_similarity_fast(root, test_pairs, test_builder),
        ]
    ).astype(np.float32)

    # Each listed validation score file is mapped to its test counterpart:
    # same relative location under post95_test_scores, with the first "val_"
    # in the file name replaced by "test_".
    selected = [Path(x.strip()) for x in (root / "validation_runs/dynamic_seed202/post95_submission/selected_variant_val_scores.txt").read_text().splitlines() if x.strip()]
    test_scores = []
    for val_path in selected:
        rel = val_path.resolve().relative_to(root / "validation_runs/dynamic_seed202")
        tp = root / "validation_runs/dynamic_seed202/post95_test_scores" / rel.parent / rel.name.replace("val_", "test_", 1)
        test_scores.append(np.load(tp).astype(np.float32))
    Xt = np.column_stack([Xt, gen.variant_feature_matrix(post, test_scores)]).astype(np.float32)

    content_test = extra.content_mean_score(root, test_pairs, test_builder)
    mf_test = np.load(root / "validation_runs/dynamic_seed202/extra_bprmf_submission/test_mf_bpr_dynamic_s202_d256_e220.npy").astype(np.float32)
    Xct, _ = extra.score_to_features(content_test, "content_mean_cos", test_pairs)
    Xmt, _ = extra.score_to_features(mf_test, "mf_bpr", test_pairs)
    Xt = np.column_stack([Xt, Xct, Xmt, rich.content_rich_features(root, test_pairs, test_builder)]).astype(np.float32)

    test_blocks = _random_walk_blocks(rw, cfgs, versions, model_dir, test_pairs, root, args.split_seed, full_refs)
    Xt = np.column_stack([Xt, *test_blocks, ens.aggregate(test_blocks)]).astype(np.float32)
    print("Xt", Xt.shape)

    # Fail before the expensive full fit if the two matrices cannot be used
    # with the same model.
    if Xt.shape[1] != X.shape[1]:
        raise ValueError(
            f"train/test feature width mismatch: X has {X.shape[1]} columns, Xt has {Xt.shape[1]}"
        )

    # ---- Final fit and submissions -------------------------------------
    test_score = fit_full_predict(X, y, Xt, args.seed + 500, leaves=best_leaves)
    np.save(out / f"{best['stage']}_test_pred.npy", test_score)
    known = np.load(root / "cached_scores/test_known_mask.npy").astype(bool)
    anchor = pd.read_csv(root / "validation_runs/dynamic_seed202/node2vec_deepwalk_submission/submission_content_mf_deepwalk_node2vec_lgb_th0.480000.csv")["Predicted"].to_numpy(np.int8)

    # The last ratio is the share of out-of-fold scores at or above the best
    # validation threshold.
    oof_positive_ratio = float((oof_by_stage[best["stage"]] >= best["threshold"]).mean())
    sub_rows = []
    for ratio in [0.498, 0.499, 0.500, 0.501, 0.502, oof_positive_ratio]:
        path = sub_dir / f"submission_{best['stage']}_r{ratio:.6f}.csv"
        pos, changed = write_sub(path, test_score, known, anchor, ratio)
        sub_rows.append({"path": str(path), "ratio": ratio, "test_positive_ratio": pos, "changed_vs_anchor": changed, **best})
    summary = pd.DataFrame(sub_rows)
    summary.to_csv(out / "submission_summary.csv", index=False)
    print(summary.to_string(index=False))
# Run the pipeline only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()