cs3319-project2 / code /generate_post95_submission.py
NLP-beginner's picture
CS3319 Project 2 final deliverable (public F1 = 0.96626)
f28d994
Raw
History Blame Contribute Delete
13 kB
"""Generate test submissions for the post-0.95 stacked ensemble."""
from __future__ import annotations
import argparse
import importlib.util
import pickle as pkl
import re
from pathlib import Path
import lightgbm as lgb
import numpy as np
import pandas as pd
import torch
def load_module(name: str, path: Path):
    """Import the Python source file at ``path`` as a module named ``name``.

    Args:
        name: Module name handed to the import spec.
        path: Location of the source file to execute.

    Returns:
        The module object after its top-level code has been executed.

    Raises:
        ImportError: If no import spec or loader can be derived from ``path``
            (for example, a file suffix the import system does not recognise).
    """
    spec = importlib.util.spec_from_file_location(name, path)
    # spec_from_file_location returns None when it cannot choose a loader.
    # An ``assert`` would be stripped under ``python -O`` and never covered
    # the ``spec is None`` case, so validate explicitly instead.
    if spec is None or spec.loader is None:
        raise ImportError(f"cannot load module {name!r} from {path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
def read_txt(path: Path) -> list[list[int]]:
    """Parse a text file of whitespace-separated integers, one row per line.

    Args:
        path: File to read.

    Returns:
        One list of ints per line of the file, in file order. A line holding
        only whitespace yields an empty list.

    Raises:
        ValueError: If a token cannot be parsed as an integer.
    """
    # Use a context manager so the handle is closed deterministically instead
    # of being left for the garbage collector.
    with path.open() as handle:
        return [list(map(int, line.strip().split())) for line in handle]
def infer_layers(path: Path, state: dict) -> int:
    """Work out how many propagation layers a checkpoint was built with.

    Args:
        path: Checkpoint file path; the name of its grandparent directory and
            its own file name are searched for a layer tag.
        state: Loaded state dict of the checkpoint.

    Returns:
        ``len(state["layer_weight"]) - 1`` when that entry exists; otherwise
        the number captured by ``_l<N>d`` or, failing that, ``L<N>`` in the
        combined run/file name; otherwise the default of 4.
    """
    if "layer_weight" in state:
        weight_count = state["layer_weight"].shape[0]
        return int(weight_count - 1)

    label = "_".join((path.parent.parent.name, path.name))
    # Patterns are tried in priority order; the first hit decides.
    for pattern in (r"_l(\d+)d", r"L(\d+)"):
        found = re.search(pattern, label)
        if found is not None:
            return int(found.group(1))
    return 4
def infer_mode(score_path: Path) -> str:
    """Derive the scoring mode from markers in a score file's name.

    Args:
        score_path: Path of a score file; only its file name is inspected.

    Returns:
        ``"dot"`` if the name contains ``_dot_``, else ``"neg_l2"`` if it
        contains ``_neg_l2_``, else ``"cos"``.
    """
    filename = score_path.name
    # Order matters: the "_dot_" marker takes precedence over "_neg_l2_".
    for marker, mode in (("_dot_", "dot"), ("_neg_l2_", "neg_l2")):
        if marker in filename:
            return mode
    return "cos"
def score_cache_path(root: Path, split_seed: int, val_score_path: Path) -> Path:
    """Map a validation score file to the cache path of its test-set scores.

    The result mirrors the score file's location below
    ``<root>/validation_runs/dynamic_seed<split_seed>`` inside that
    directory's ``post95_test_scores`` sub-tree, with the first ``val_`` in
    the file name replaced by ``test_``.

    Args:
        root: Package root directory.
        split_seed: Seed identifying the ``dynamic_seed<N>`` run directory.
        val_score_path: Validation score file located below that directory.

    Returns:
        Path of the cached test score file, built on ``root`` as given.

    Raises:
        ValueError: If ``val_score_path`` is not inside the seed directory.
    """
    seed_dir = Path("validation_runs") / f"dynamic_seed{split_seed}"
    # Resolve both sides before comparing: the score path is made absolute,
    # so an unresolved (relative or symlinked) root would otherwise make
    # relative_to() fail even though the file really is under root.
    rel = val_score_path.resolve().relative_to((root / seed_dir).resolve())
    name = rel.name.replace("val_", "test_", 1)
    return root / seed_dir / "post95_test_scores" / rel.parent / name
def checkpoint_for_score(score_path: Path) -> Path:
    """Locate the checkpoint file that produced a validation score file.

    The score file stem (with its first ``val_`` removed) is split on ``_``.
    The first token is the variant; the first ``s<digits>`` token is the seed
    and the first ``d<digits>`` token is the dimension. The checkpoint lives
    in the ``checkpoints`` directory next to the score file's directory and
    is named ``<variant>_val_<seed>_<dim>.pt``.

    Args:
        score_path: Path to a per-model validation score file.

    Returns:
        Path of the matching checkpoint.

    Raises:
        ValueError: If the file is an ensemble mean (no single checkpoint),
            or if its name carries no seed or dimension token.
    """
    score_path = score_path.resolve()
    stem = score_path.stem.replace("val_", "", 1)
    if stem.endswith("_ensemble_mean"):
        raise ValueError("ensemble scores do not map to a single checkpoint")
    parts = stem.split("_")
    variant = parts[0]

    def first_tagged(prefix: str):
        """Return the first token of the form ``<prefix><digits>``, or None."""
        return next((p for p in parts if p.startswith(prefix) and p[1:].isdigit()), None)

    seed = first_tagged("s")
    dim = first_tagged("d")
    # A bare next() would leak StopIteration here, which says nothing about
    # the offending file; report the malformed name explicitly instead.
    if seed is None or dim is None:
        raise ValueError(
            f"cannot find seed/dim tokens (s<N>, d<N>) in score file name {score_path.name!r}"
        )
    return score_path.parent.parent / "checkpoints" / f"{variant}_val_{seed}_{dim}.pt"
def ensemble_member_scores(score_path: Path) -> list[Path]:
    """List the per-model score files that make up an ensemble score file.

    Member stems are read from the comma-separated ``models=`` line of
    ``ensemble_result.txt``, which sits one directory above the score file's
    directory. Each member is expected next to ``score_path`` as
    ``<stem>.npy``.

    Args:
        score_path: Path to the ensemble score file.

    Returns:
        Paths of the member score files, in the order they are listed.

    Raises:
        FileNotFoundError: If ``ensemble_result.txt`` does not exist.
        ValueError: If the result file contains no ``models=`` line.
    """
    score_path = score_path.resolve()
    result_path = score_path.parent.parent / "ensemble_result.txt"
    models_line = next(
        (line for line in result_path.read_text().splitlines() if line.startswith("models=")),
        None,
    )
    # Without a default, next() would raise a bare StopIteration that gives
    # no hint about which file is malformed.
    if models_line is None:
        raise ValueError(f"no 'models=' line found in {result_path}")
    stems = [x.strip() for x in models_line.split("=", 1)[1].split(",") if x.strip()]
    return [score_path.parent / f"{stem}.npy" for stem in stems]
@torch.no_grad()
def score_checkpoint_on_test(
    root: Path,
    split_seed: int,
    module,
    parts,
    data_cache: dict,
    test_pairs: np.ndarray,
    val_score_path: Path,
    device: str,
    batch_size: int,
) -> np.ndarray:
    """Score the test pairs with the checkpoint behind one validation score file.

    A previously saved result is returned straight from disk. Otherwise the
    checkpoint is loaded, the model class and graph options are inferred from
    the checkpoint and run-directory names, the pairs are scored through
    ``module.predict_scores``, and the float32 result is saved to the cache.

    Args:
        root: Package root directory.
        split_seed: Seed of the ``dynamic_seed<N>`` run directory.
        module: Training module providing ``build_data``, ``LightGCN``,
            ``LearnableWeightLightGCN`` and ``predict_scores``.
        parts: Object passed to ``module.build_data``; its
            ``"paper_feat_aug"`` entry supplies the feature width.
        data_cache: Dict keyed by ``(use_citation, use_coauthor)``; filled in
            place so each graph variant is built at most once.
        test_pairs: Pairs to score.
        val_score_path: Validation score file identifying the checkpoint.
        device: Torch device string.
        batch_size: Batch size forwarded to ``module.predict_scores``.

    Returns:
        float32 scores for ``test_pairs``.
    """
    cache_file = score_cache_path(root, split_seed, val_score_path)
    if cache_file.exists():
        return np.load(cache_file)
    cache_file.parent.mkdir(parents=True, exist_ok=True)

    checkpoint = checkpoint_for_score(val_score_path)
    weights = torch.load(checkpoint, map_location=device)
    hidden = weights["author_emb.weight"].shape[1]
    depth = infer_layers(checkpoint, weights)

    # Ablation runs encode the disabled edge types in the run directory name.
    run_label = checkpoint.parent.parent.name
    author_paper_only = "author_paper_only" in run_label
    with_citation = not ("no_cite" in run_label or author_paper_only)
    with_coauthor = not ("no_coauthor" in run_label or author_paper_only)
    graph_key = (with_citation, with_coauthor)
    if graph_key not in data_cache:
        # 6611 and 79937 are presumably the author and paper counts of the
        # dataset -- TODO confirm against module.build_data.
        data_cache[graph_key] = module.build_data(
            parts,
            6611,
            79937,
            torch.device(device),
            use_citation=with_citation,
            use_coauthor=with_coauthor,
        )

    if "learnw" in checkpoint.name:
        net_cls = module.LearnableWeightLightGCN
    else:
        net_cls = module.LightGCN
    feature_width = parts["paper_feat_aug"].shape[1]
    net = net_cls(6611, feature_width, hidden, depth).to(torch.device(device))
    net.load_state_dict(weights)

    test_scores = module.predict_scores(
        net,
        data_cache[graph_key],
        test_pairs,
        batch_size,
        mode=infer_mode(val_score_path),
        normalize_embeddings=False,
    ).astype(np.float32)
    np.save(cache_file, test_scores)

    # Drop the model before the next checkpoint is loaded.
    del net
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    print(f"saved {cache_file}")
    return test_scores
def score_val_path_on_test(
    root: Path,
    split_seed: int,
    module,
    parts,
    data_cache: dict,
    test_pairs: np.ndarray,
    val_score_path: Path,
    device: str,
    batch_size: int,
) -> np.ndarray:
    """Produce test scores matching a validation score file, with caching.

    A cached result is returned if present. A file whose name ends in
    ``_ensemble_mean.npy`` is scored as the element-wise mean of its member
    scores (each obtained recursively) and the mean is cached. Any other file
    is delegated to ``score_checkpoint_on_test``.

    Args:
        root: Package root directory.
        split_seed: Seed of the ``dynamic_seed<N>`` run directory.
        module: Training module forwarded to ``score_checkpoint_on_test``.
        parts: Data parts forwarded to ``score_checkpoint_on_test``.
        data_cache: Shared graph-data cache, forwarded unchanged.
        test_pairs: Pairs to score.
        val_score_path: Validation score file (single model or ensemble).
        device: Torch device string.
        batch_size: Scoring batch size.

    Returns:
        float32 scores for ``test_pairs``.
    """
    cache_file = score_cache_path(root, split_seed, val_score_path)
    if cache_file.exists():
        return np.load(cache_file)

    # Single-model files map directly onto one checkpoint.
    if not val_score_path.name.endswith("_ensemble_mean.npy"):
        return score_checkpoint_on_test(
            root, split_seed, module, parts, data_cache, test_pairs, val_score_path, device, batch_size
        )

    member_scores = []
    for member_path in ensemble_member_scores(val_score_path):
        member_scores.append(
            score_val_path_on_test(
                root, split_seed, module, parts, data_cache, test_pairs, member_path, device, batch_size
            )
        )
    cache_file.parent.mkdir(parents=True, exist_ok=True)
    averaged = np.mean(member_scores, axis=0).astype(np.float32)
    np.save(cache_file, averaged)
    print(f"saved {cache_file}")
    return averaged
def select_variant_val_scores(post95, root: Path, split_seed: int, y: np.ndarray, max_cols: int) -> list[Path]:
    """Pick the best-scoring variant validation score files.

    Candidates are the ``dyn*/scores/val_*.npy`` files below
    ``<root>/validation_runs/dynamic_seed<split_seed>``. A candidate is
    dropped when its path below that directory contains one of the excluded
    tokens, when its length differs from ``y``, or when it is (near) constant.
    The rest are ranked by the first value returned by ``post95.best_f1``.

    Args:
        post95: Module providing ``best_f1(y, scores)``, which returns a
            5-tuple whose first element is used as the ranking score.
        root: Package root directory.
        split_seed: Seed of the ``dynamic_seed<N>`` run directory.
        y: Validation labels.
        max_cols: Maximum number of score files to return.

    Returns:
        Up to ``max_cols`` paths, best first; ties keep sorted-path order.
    """
    seed_dir = root / "validation_runs" / f"dynamic_seed{split_seed}"
    excluded_tokens = ("hgt", "sage", "bce", "norm", "hinge")
    rows = []
    for path in sorted(seed_dir.glob("dyn*/scores/val_*.npy")):
        # Match tokens only against the run/file part of the path. Testing
        # the absolute path would let an unrelated parent directory (e.g.
        # "/home/norman/...") silently exclude every candidate.
        rel = path.relative_to(seed_dir).as_posix()
        if any(token in rel for token in excluded_tokens):
            continue
        scores = np.load(path).astype(np.float32)
        if len(scores) != len(y) or np.std(scores) < 1e-8:
            continue
        f1, _, auc, _, _ = post95.best_f1(y, scores)
        rows.append((f1, auc, path))
    # list.sort is stable, so equal-F1 entries stay in sorted-path order.
    rows.sort(key=lambda r: r[0], reverse=True)
    return [p for _, _, p in rows[:max_cols]]
def variant_feature_matrix(post95, raw_scores: list[np.ndarray]) -> np.ndarray:
    """Turn per-variant score vectors into a stacked feature matrix.

    Each score vector contributes two columns (``post95.zscore`` then
    ``post95.rank01``). Three summary columns follow: z-scored mean across
    variants, z-scored standard deviation across variants, and rank of the
    mean.

    Args:
        post95: Module providing ``zscore`` and ``rank01``.
        raw_scores: One score vector per variant, all of equal length.

    Returns:
        float32 matrix with ``2 * len(raw_scores) + 3`` columns, or an empty
        ``(0, 0)`` matrix when ``raw_scores`` is empty.
    """
    if not raw_scores:
        return np.zeros((0, 0), dtype=np.float32)

    per_variant = [
        transform(scores)
        for scores in raw_scores
        for transform in (post95.zscore, post95.rank01)
    ]
    stacked = np.vstack(raw_scores)
    consensus = stacked.mean(axis=0)
    spread = stacked.std(axis=0)
    summary = [post95.zscore(consensus), post95.zscore(spread), post95.rank01(consensus)]
    return np.column_stack(per_variant + summary).astype(np.float32)
def topk_content_similarity_fast(root: Path, pairs: np.ndarray, builder) -> np.ndarray:
    """Compute content-similarity features between candidates and author history.

    For every ``(author, paper)`` row, the candidate paper's L2-normalised
    feature vector is compared by dot product with the vectors of the papers
    in ``builder.author_papers[author]``. The three output columns are the
    maximum similarity, the mean of the top 3 and the mean of the top 5
    (using fewer when the history is shorter). Rows whose author has no
    history stay at zero. Results are cached as ``.npy`` files under
    ``<root>/validation_runs/feature_cache``.

    Args:
        root: Package root; features are read from
            ``data_and_docs/feature.pkl`` below it.
        pairs: Integer array with the author id in column 0 and the paper id
            in column 1.
        builder: Object exposing ``author_papers``, indexable by author id
            and yielding an iterable of paper ids.

    Returns:
        float32 array of shape ``(len(pairs), 3)``.
    """
    # With no rows the grouping below would still produce one empty group and
    # fail on idx[0]; there is nothing to compute or cache in that case.
    if len(pairs) == 0:
        return np.zeros((0, 3), dtype=np.float32)
    cache = root / "validation_runs" / "feature_cache"
    cache.mkdir(parents=True, exist_ok=True)
    # NOTE(review): the key only fingerprints the row count and column sums
    # and ignores the builder's history, so distinct inputs could in
    # principle collide. Kept unchanged so existing cache files stay valid.
    key = f"topk_content_{len(pairs)}_{int(pairs[:,0].sum())}_{int(pairs[:,1].sum())}.npy"
    path = cache / key
    if path.exists():
        return np.load(path)
    # NOTE(review): pickle executes arbitrary code on load; only point this
    # at a trusted feature.pkl.
    with (root / "data_and_docs" / "feature.pkl").open("rb") as f:
        feat = pkl.load(f).numpy().astype(np.float32)
    # Normalise rows so the dot products below are cosine similarities.
    feat /= np.linalg.norm(feat, axis=1, keepdims=True) + 1e-8
    out = np.zeros((len(pairs), 3), dtype=np.float32)
    # A stable sort groups rows by author; boundaries delimit each group.
    order = np.argsort(pairs[:, 0], kind="mergesort")
    authors = pairs[order, 0]
    boundaries = np.r_[0, np.flatnonzero(authors[1:] != authors[:-1]) + 1, len(order)]
    for lo, hi in zip(boundaries[:-1], boundaries[1:]):
        idx = order[lo:hi]
        author = int(pairs[idx[0], 0])
        hist = np.asarray(list(builder.author_papers[author]), dtype=np.int64)
        if len(hist) == 0:
            continue
        cand = pairs[idx, 1].astype(np.int64)
        sims = feat[cand] @ feat[hist].T
        out[idx, 0] = sims.max(axis=1)
        for col, k in [(1, 3), (2, 5)]:
            # Never ask for more neighbours than the history holds.
            kk = min(k, sims.shape[1])
            top = np.partition(sims, -kk, axis=1)[:, -kk:]
            out[idx, col] = top.mean(axis=1)
    np.save(path, out)
    return out
def make_submissions(root: Path, out_dir: Path, pred_score: np.ndarray, ratios: list[float]) -> None:
    """Write one thresholded submission CSV per requested positive ratio.

    For each ratio, the ``round(len(pred_score) * ratio)`` highest-scoring
    rows are labelled 1 and the rest 0. Rows flagged in
    ``cached_scores/test_known_mask.npy`` are then forced to 1. The file is
    written to ``out_dir`` as ``submission_post95_ens_r<ratio>.csv`` with
    columns ``Index`` and ``Predicted``.

    Args:
        root: Package root containing ``cached_scores/test_known_mask.npy``.
        out_dir: Existing directory that receives the CSV files.
        pred_score: Predicted score per test row; higher means more positive.
        ratios: Fractions of rows to label positive before the mask is applied.
    """
    known = np.load(root / "cached_scores" / "test_known_mask.npy").astype(bool)
    total = len(pred_score)
    # The ascending ranking does not depend on the ratio, so sort once.
    ranking = np.argsort(pred_score)
    for ratio in ratios:
        # Clamp to [0, total]; a negative count would turn into a
        # from-the-front slice below.
        n_pos = min(max(int(round(total * ratio)), 0), total)
        pred = np.zeros(total, dtype=np.int8)
        # ranking[-0:] is the whole array, so a zero count must be skipped
        # explicitly or every row would be marked positive.
        if n_pos > 0:
            pred[ranking[-n_pos:]] = 1
        pred[known] = 1
        sub = pd.DataFrame({"Index": np.arange(len(pred), dtype=np.int64), "Predicted": pred})
        path = out_dir / f"submission_post95_ens_r{ratio:.3f}.csv"
        sub.to_csv(path, index=False)
        print(f"{path} positives={int(pred.sum())} ratio={pred.mean():.6f}")
def main() -> None:
    """Fit the stacked ensemble on the validation split and write test submissions.

    Steps, in order:
      1. Parse CLI arguments and load the sibling project modules from
         ``<package-root>/code``.
      2. Build the validation feature matrix (rank features, explicit graph
         features, negative-evidence features, content similarity, plus
         features from the selected variant score files).
      3. Fit a LightGBM classifier on the validation matrix and labels.
      4. Build the same feature layout for the test pairs, scoring every
         required checkpoint on the test set.
      5. Save the predicted probabilities and write one submission CSV per
         ratio into ``validation_runs/dynamic_seed<N>/post95_submission``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--package-root", type=Path, default=Path(__file__).resolve().parents[1])
    parser.add_argument("--split-seed", type=int, default=202)
    parser.add_argument("--main-val-score-file", type=Path, required=True)
    parser.add_argument("--device", default="cuda:0" if torch.cuda.is_available() else "cpu")
    parser.add_argument("--batch-size", type=int, default=131072)
    parser.add_argument("--max-variant-cols", type=int, default=20)
    parser.add_argument("--seed", type=int, default=202)
    parser.add_argument("--ratios", nargs="*", type=float, default=[0.498, 0.500, 0.502, 0.504, 0.505])
    args = parser.parse_args()

    root = args.package_root
    args.main_val_score_file = args.main_val_score_file.resolve()
    stack_mod = load_module("stack_rank_calibration", root / "code" / "stack_rank_calibration.py")
    lgcn_mod = load_module("train_val_lgcn_ensemble", root / "code" / "train_val_lgcn_ensemble.py")
    post95 = load_module("post95_ablation", root / "code" / "post95_ablation.py")
    out_dir = root / "validation_runs" / f"dynamic_seed{args.split_seed}" / "post95_submission"
    out_dir.mkdir(parents=True, exist_ok=True)

    train_refs, val_pairs = lgcn_mod.make_notebook_style_split(root, args.split_seed, 0.9)
    val_pairs_arr = val_pairs[["source", "target"]].to_numpy(np.int64)
    y = val_pairs["label"].to_numpy(np.int8)
    main_val_score = np.load(args.main_val_score_file).astype(np.float32)

    print("building validation features")
    val_builder = stack_mod.ExplicitGraphFeatures(root, train_refs)
    X_val_hand = val_builder.transform(val_pairs_arr)
    X_val = np.column_stack(
        [
            stack_mod.add_rank_features(val_pairs_arr, main_val_score),
            X_val_hand,
            post95.negative_evidence_features(X_val_hand, main_val_score),
            topk_content_similarity_fast(root, val_pairs_arr, val_builder),
        ]
    ).astype(np.float32)
    selected_paths = select_variant_val_scores(post95, root, args.split_seed, y, args.max_variant_cols)
    (out_dir / "selected_variant_val_scores.txt").write_text("\n".join(str(p) for p in selected_paths) + "\n")
    X_val_var = variant_feature_matrix(post95, [np.load(p).astype(np.float32) for p in selected_paths])
    # With no selected variants the variant matrix is (0, 0) and cannot be
    # column-stacked onto an (n, k) matrix, so only append it when non-empty
    # (e.g. --max-variant-cols 0 or no usable score files).
    if selected_paths:
        X_val = np.column_stack([X_val, X_val_var]).astype(np.float32)
    print(f"validation matrix {X_val.shape}")

    clf = lgb.LGBMClassifier(
        n_estimators=1200,
        learning_rate=0.025,
        num_leaves=31,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=5.0,
        min_child_samples=80,
        objective="binary",
        verbose=-1,
        random_state=args.seed,
    )
    clf.fit(X_val, y)

    print("building test features")
    test_pairs = np.array(read_txt(root / "data_and_docs" / "bipartite_test_ann.txt"), dtype=np.int64)
    parts = lgcn_mod.build_parts(root, None, 79937, split_seed=args.split_seed, train_frac=0.9)
    # Shared across all checkpoints so each graph variant is built only once.
    data_cache = {}
    main_test_score = score_val_path_on_test(
        root,
        args.split_seed,
        lgcn_mod,
        parts,
        data_cache,
        test_pairs,
        args.main_val_score_file,
        args.device,
        args.batch_size,
    )
    # Test-time graph features are built from the full training edge list.
    full_refs = pd.DataFrame(read_txt(root / "data_and_docs" / "bipartite_train_ann.txt"), columns=["source", "target"])
    test_builder = stack_mod.ExplicitGraphFeatures(root, full_refs)
    X_test_hand = test_builder.transform(test_pairs)
    X_test = np.column_stack(
        [
            stack_mod.add_rank_features(test_pairs, main_test_score),
            X_test_hand,
            post95.negative_evidence_features(X_test_hand, main_test_score),
            topk_content_similarity_fast(root, test_pairs, test_builder),
        ]
    ).astype(np.float32)
    test_variant_scores = [
        score_val_path_on_test(root, args.split_seed, lgcn_mod, parts, data_cache, test_pairs, p, args.device, args.batch_size)
        for p in selected_paths
    ]
    X_test_var = variant_feature_matrix(post95, test_variant_scores)
    # Mirror the validation layout: skip the empty variant block.
    if selected_paths:
        X_test = np.column_stack([X_test, X_test_var]).astype(np.float32)
    print(f"test matrix {X_test.shape}")

    pred_score = clf.predict_proba(X_test)[:, 1].astype(np.float32)
    np.save(out_dir / "test_post95_ens_pred.npy", pred_score)
    make_submissions(root, out_dir, pred_score, args.ratios)
# Run the submission pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()