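# cs3319-project2 / code / content_rich_ablation.py
# Repository page header (not part of the program): NLP-beginner's picture
# Commit f28d994: CS3319 Project 2 final deliverable (public F1 = 0.96626)
# Page controls: Raw | History | Blame | Contribute | Delete
# File size: 9.4 kB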
"""Richer feature.pkl content features for the post95 + BPR-MF stacker."""
from __future__ import annotations
import argparse
import importlib.util
import pickle as pkl
from pathlib import Path
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import precision_recall_curve, roc_auc_score
from sklearn.model_selection import StratifiedKFold
def load_module(name: str, path: Path):
    """Import the Python source file at ``path`` as a module called ``name``.

    The module is executed and returned; it is not registered in
    ``sys.modules``, so repeated calls execute the file again.

    Args:
        name: Name assigned to the created module object.
        path: Location of the source file to execute.

    Returns:
        The freshly executed module object.

    Raises:
        ImportError: If no import spec or loader can be derived from ``path``
            (for example, the file suffix is not a recognised module suffix).
    """
    spec = importlib.util.spec_from_file_location(name, path)
    # spec_from_file_location returns None when no loader matches the path.
    # Check before module_from_spec (which would fail with an opaque
    # AttributeError) and use a real exception rather than ``assert``, which
    # is stripped when Python runs with -O.
    if spec is None or spec.loader is None:
        raise ImportError(f"cannot build an import spec for module {name!r} from {path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
def best_f1(y: np.ndarray, s: np.ndarray):
    """Find the score threshold that maximises F1 on the precision-recall curve.

    Args:
        y: Binary ground-truth labels.
        s: Predicted scores aligned with ``y``.

    Returns:
        A tuple ``(f1, threshold, auc, precision, recall)`` of Python floats:
        the best F1 along the curve, the threshold at that point, the ROC AUC
        of ``s`` against ``y``, and the precision and recall at that point.
    """
    precision, recall, thresholds = precision_recall_curve(y, s)
    # The small epsilon keeps the ratio finite where precision + recall == 0.
    f_scores = 2 * precision * recall / (precision + recall + 1e-12)
    best = int(np.argmax(f_scores))
    if best < len(thresholds):
        threshold = float(thresholds[best])
    else:
        # The chosen curve point has no matching threshold entry; use 0.5.
        threshold = 0.5
    best_score = float(f_scores[best])
    auc = float(roc_auc_score(y, s))
    return best_score, threshold, auc, float(precision[best]), float(recall[best])
def fit_lgb_oof(X: np.ndarray, y: np.ndarray, seed: int, n_splits: int, *, ranker_like: bool = False) -> np.ndarray:
    """Produce out-of-fold LightGBM probabilities with stratified K-fold CV.

    Args:
        X: Feature matrix, one row per sample.
        y: Binary labels aligned with the rows of ``X``.
        seed: Seed for the fold shuffle; fold ``k`` (1-based) trains its model
            with ``random_state=seed + k``.
        n_splits: Number of stratified folds.
        ranker_like: When true, use the shorter schedule (800 trees at
            learning rate 0.03) instead of 1200 trees at 0.025.

    Returns:
        A float32 vector of length ``len(y)`` in which every entry is the
        positive-class probability predicted by the model that did not train
        on that row.
    """
    if ranker_like:
        n_trees, step = 800, 0.03
    else:
        n_trees, step = 1200, 0.025

    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    predictions = np.zeros(len(y), dtype=np.float32)
    for fold_no, (train_idx, valid_idx) in enumerate(splitter.split(X, y), start=1):
        # NOTE(review): LightGBM documents that bagging is only active when a
        # bagging frequency is set; with subsample_freq left unset here,
        # ``subsample=0.9`` may have no effect - confirm before relying on it.
        model = lgb.LGBMClassifier(
            n_estimators=n_trees,
            learning_rate=step,
            num_leaves=31,
            subsample=0.9,
            colsample_bytree=0.9,
            reg_lambda=5.0,
            min_child_samples=80,
            objective="binary",
            verbose=-1,
            random_state=seed + fold_no,
        )
        model.fit(X[train_idx], y[train_idx])
        predictions[valid_idx] = model.predict_proba(X[valid_idx])[:, 1]
    return predictions
def content_rich_features(root: Path, pairs: np.ndarray, builder) -> np.ndarray:
    """Build 18 content-similarity features for each (author, paper) pair.

    Paper embeddings are read from ``<root>/data_and_docs/feature.pkl`` and
    L2-normalised row-wise, so dot products between rows are cosine
    similarities. Results are cached as ``.npy`` files under
    ``<root>/validation_runs/feature_cache``.

    Column layout of the returned matrix:

    * 0: cosine between the candidate and the author's normalised mean
      history embedding.
    * 1: Euclidean distance between the candidate and the (un-normalised)
      mean history embedding.
    * 2, 3: history size and ``log1p`` of it.
    * 4: root-mean-square distance of the history papers from their mean.
    * 5: mean pairwise cosine among history papers (1.0 for a single paper,
      0.0 for an empty history).
    * 6: column 0 divided by ``column 4 + 1e-3``.
    * 7-10: max, mean, std and median of the candidate's cosine to each
      history paper.
    * 11-13: mean of the top 3 / 5 / 10 such cosines (capped at the history
      size).
    * 14, 15: fraction of history papers with cosine > 0.5 / > 0.7.
    * 16: rank percentile in [0, 1] of column 0 among the same author's
      candidates in ``pairs`` (ties broken by input order).
    * 17: ``1 - column 16``.

    Columns 7-17 stay zero for authors without any history.

    Args:
        root: Package root containing ``data_and_docs`` and
            ``validation_runs``.
        pairs: Integer array with author indices in column 0 and paper
            indices in column 1.
        builder: Object exposing ``num_authors`` and ``author_papers``, where
            ``author_papers[a]`` is an iterable of paper indices for author
            ``a``.

    Returns:
        A float32 array of shape ``(len(pairs), 18)`` in the row order of
        ``pairs``.
    """
    n_features = 18
    # An empty request has no groups to process; without this guard the
    # per-author grouping below would index into an empty group and raise.
    if len(pairs) == 0:
        return np.zeros((0, n_features), dtype=np.float32)

    cache = root / "validation_runs" / "feature_cache"
    cache.mkdir(parents=True, exist_ok=True)
    # NOTE(review): the cache key uses only the pair count and column sums, so
    # a different pair set (or row order, or builder) with the same totals
    # would reuse a stale file - clear the cache when inputs change.
    path = cache / f"content_rich_{len(pairs)}_{int(pairs[:, 0].sum())}_{int(pairs[:, 1].sum())}.npy"
    if path.exists():
        return np.load(path)

    # SECURITY: pickle.load can execute arbitrary code; only point this at a
    # trusted feature.pkl. The unpickled object must provide ``.numpy()``
    # (presumably a torch tensor - confirm).
    with (root / "data_and_docs" / "feature.pkl").open("rb") as f:
        feat = pkl.load(f).numpy().astype(np.float32)
    feat /= np.linalg.norm(feat, axis=1, keepdims=True) + 1e-8

    n_authors = builder.num_authors
    dim = feat.shape[1]
    # Materialise each author's history once; both passes below reuse it.
    histories = [np.asarray(list(builder.author_papers[a]), dtype=np.int64) for a in range(n_authors)]

    mean = np.zeros((n_authors, dim), dtype=np.float32)
    mean_normed = np.zeros((n_authors, dim), dtype=np.float32)
    std_scalar = np.zeros(n_authors, dtype=np.float32)
    mean_pair_cos = np.zeros(n_authors, dtype=np.float32)
    hist_count = np.zeros(n_authors, dtype=np.float32)
    for a, hist in enumerate(histories):
        n_hist = len(hist)
        hist_count[a] = n_hist
        if n_hist == 0:
            continue
        H = feat[hist]
        m = H.mean(axis=0)
        mean[a] = m
        mean_normed[a] = m / (np.linalg.norm(m) + 1e-8)
        dist = np.sum((H - m) ** 2, axis=1)
        std_scalar[a] = float(np.sqrt(dist.mean()))
        if n_hist > 1:
            sims = H @ H.T
            # Subtracting n_hist removes the diagonal, which is 1 per paper
            # because the rows are unit-normalised.
            mean_pair_cos[a] = float((sims.sum() - n_hist) / (n_hist * (n_hist - 1)))
        else:
            mean_pair_cos[a] = 1.0

    out = np.zeros((len(pairs), n_features), dtype=np.float32)
    # A stable sort groups rows by author while keeping input order inside
    # each group; ``boundaries`` marks where each author's run starts/ends.
    order = np.argsort(pairs[:, 0], kind="mergesort")
    authors = pairs[order, 0]
    boundaries = np.r_[0, np.flatnonzero(authors[1:] != authors[:-1]) + 1, len(order)]
    for lo, hi in zip(boundaries[:-1], boundaries[1:]):
        idx = order[lo:hi]
        a = int(pairs[idx[0], 0])
        cand = pairs[idx, 1].astype(np.int64)
        C = feat[cand]
        center_cos = C @ mean_normed[a]
        center_l2 = np.sqrt(np.sum((C - mean[a]) ** 2, axis=1))
        out[idx, 0] = center_cos
        out[idx, 1] = center_l2
        out[idx, 2] = hist_count[a]
        out[idx, 3] = np.log1p(hist_count[a])
        out[idx, 4] = std_scalar[a]
        out[idx, 5] = mean_pair_cos[a]
        out[idx, 6] = center_cos / (std_scalar[a] + 1e-3)

        hist = histories[a]
        if len(hist) == 0:
            # No history: per-paper similarity and percentile columns stay 0.
            continue
        sims = C @ feat[hist].T
        out[idx, 7] = sims.max(axis=1)
        out[idx, 8] = sims.mean(axis=1)
        out[idx, 9] = sims.std(axis=1)
        out[idx, 10] = np.median(sims, axis=1)
        for col, k in ((11, 3), (12, 5), (13, 10)):
            kk = min(k, sims.shape[1])
            # After partitioning, the last kk columns hold the kk largest
            # similarities of each row (in arbitrary order).
            out[idx, col] = np.partition(sims, -kk, axis=1)[:, -kk:].mean(axis=1)
        out[idx, 14] = (sims > 0.5).mean(axis=1)
        out[idx, 15] = (sims > 0.7).mean(axis=1)

        # Percentile of candidate center similarity among this author's
        # candidates in ``pairs``.
        local_order = np.argsort(center_cos, kind="mergesort")
        if len(center_cos) > 1:
            pct = np.linspace(0, 1, len(center_cos), dtype=np.float32)
        else:
            pct = np.array([1.0], dtype=np.float32)
        ranks = np.zeros(len(center_cos), dtype=np.float32)
        ranks[local_order] = pct
        out[idx, 16] = ranks
        out[idx, 17] = 1.0 - ranks

    np.save(path, out)
    return out
def main() -> None:
    """Run the rich-content ablation on the validation split.

    Loads the sibling scripts under ``<package-root>/code`` as modules, builds
    the baseline content + MF stacking matrix, then evaluates four stages with
    out-of-fold LightGBM models: the baseline stack, the rich content features
    alone, the stack plus the raw rich features, and that stack plus the
    rich-content model's out-of-fold score. Each stage's out-of-fold scores
    are saved as ``.npy`` and a summary sorted by F1 is written to
    ``content_rich_ablation.csv`` and printed.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--package-root", type=Path, default=Path(__file__).resolve().parents[1])
    cli.add_argument("--split-seed", type=int, default=202)
    cli.add_argument("--main-val-score-file", type=Path, required=True)
    cli.add_argument("--seed", type=int, default=202)
    cli.add_argument("--n-splits", type=int, default=5)
    args = cli.parse_args()

    root = args.package_root
    code_dir = root / "code"
    stack = load_module("stack", code_dir / "stack_rank_calibration.py")
    lgcn = load_module("lgcn", code_dir / "train_val_lgcn_ensemble.py")
    post = load_module("post", code_dir / "post95_ablation.py")
    gen = load_module("gen", code_dir / "generate_post95_submission.py")
    extra = load_module("extra", code_dir / "extra_score_sources_ablation.py")

    train_refs, val_pairs = lgcn.make_notebook_style_split(root, args.split_seed, 0.9)
    pairs = val_pairs[["source", "target"]].to_numpy(np.int64)
    y = val_pairs["label"].to_numpy(np.int8)
    main_scores = np.load(args.main_val_score_file).astype(np.float32)

    # Every artefact of this split seed lives under one run directory.
    run_dir = root / "validation_runs" / f"dynamic_seed{args.split_seed}"
    out_dir = run_dir / "content_rich"
    out_dir.mkdir(parents=True, exist_ok=True)

    builder = stack.ExplicitGraphFeatures(root, train_refs)
    X_hand = builder.transform(pairs)
    base_blocks = [
        stack.add_rank_features(pairs, main_scores),
        X_hand,
        post.negative_evidence_features(X_hand, main_scores),
        gen.topk_content_similarity_fast(root, pairs, builder),
    ]
    X_base = np.column_stack(base_blocks).astype(np.float32)

    # One score-file path per non-blank line of the selection listing.
    listing = run_dir / "post95_submission" / "selected_variant_val_scores.txt"
    selected = [Path(line.strip()) for line in listing.read_text().splitlines() if line.strip()]
    variant_scores = [np.load(p).astype(np.float32) for p in selected]
    X_base = np.column_stack([X_base, gen.variant_feature_matrix(post, variant_scores)]).astype(np.float32)

    content = extra.content_mean_score(root, pairs, builder)
    mf = np.load(run_dir / "extra_score_sources" / f"val_mf_bpr_s{args.seed}_d256.npy").astype(np.float32)
    Xc, _ = extra.score_to_features(content, "content_mean_cos", pairs)
    Xm, _ = extra.score_to_features(mf, "mf_bpr", pairs)
    X_cm = np.column_stack([X_base, Xc, Xm]).astype(np.float32)

    rows = []

    def run_stage(stage, features, seed, filename, *, ranker_like=False):
        """Fit one out-of-fold model, record its metrics and save its scores."""
        scores = fit_lgb_oof(features, y, seed, args.n_splits, ranker_like=ranker_like)
        f1, th, auc, p, r = best_f1(y, scores)
        rows.append(
            {
                "stage": stage,
                "f1": f1,
                "threshold": th,
                "auc": auc,
                "precision": p,
                "recall": r,
                "n_features": features.shape[1],
            }
        )
        np.save(out_dir / filename, scores)
        return scores

    print("baseline content+mf stack")
    run_stage("content_mf_baseline", X_cm, args.seed, "content_mf_baseline_oof.npy")

    print("rich content feature-only model")
    X_rich = content_rich_features(root, pairs, builder)
    rich_oof = run_stage(
        "rich_content_only_lgb", X_rich, args.seed + 7, "rich_content_only_oof.npy", ranker_like=True
    )

    print("stack + rich content raw features")
    X_all = np.column_stack([X_cm, X_rich]).astype(np.float32)
    run_stage("+rich_content_features", X_all, args.seed + 11, "rich_content_stack_oof.npy")

    print("stack + rich content model score")
    X_score, _ = extra.score_to_features(rich_oof, "rich_content_lgb_oof", pairs)
    X_all_score = np.column_stack([X_all, X_score]).astype(np.float32)
    run_stage(
        "+rich_content_model_score", X_all_score, args.seed + 13, "rich_content_model_score_stack_oof.npy"
    )

    result = pd.DataFrame(rows).sort_values("f1", ascending=False)
    result.to_csv(out_dir / "content_rich_ablation.csv", index=False)
    print(result.to_string(index=False))


if __name__ == "__main__":
    main()