cs3319-project2 / code /generate_post95_submission.py

CS3319 Project 2 final deliverable (public F1 = 0.96626)

f28d994 12 days ago

13 kB

	"""Generate test submissions for the post-0.95 stacked ensemble."""

	from __future__ import annotations

	import argparse
	import importlib.util
	import pickle as pkl
	import re
	from pathlib import Path

	import lightgbm as lgb
	import numpy as np
	import pandas as pd
	import torch


	def load_module(name: str, path: Path):
	    """Import the Python source file at ``path`` as a module named ``name``.

	    The module is executed and returned; it is not registered in
	    ``sys.modules``.

	    Raises:
	        ImportError: if no import spec or loader can be built for ``path``
	            (for example because its suffix is not a recognised source
	            suffix).
	    """
	    spec = importlib.util.spec_from_file_location(name, path)
	    # Validate explicitly rather than with ``assert``: asserts are stripped
	    # under ``python -O``, and a ``None`` spec would otherwise surface as an
	    # opaque AttributeError inside ``module_from_spec``.
	    if spec is None or spec.loader is None:
	        raise ImportError(f"cannot load module {name!r} from {path}")
	    module = importlib.util.module_from_spec(spec)
	    spec.loader.exec_module(module)
	    return module


	def read_txt(path: Path) -> list[list[int]]:
	    """Parse a whitespace-separated integer table from a text file.

	    Each line of ``path`` becomes one list of ints; a blank line yields an
	    empty list.

	    Raises:
	        ValueError: if a token cannot be parsed as an integer.
	    """
	    # Use a context manager so the handle is closed deterministically
	    # instead of being left open until garbage collection.
	    with path.open() as handle:
	        return [[int(token) for token in line.split()] for line in handle]


	def infer_layers(path: Path, state: dict) -> int:
	    """Work out the layer count of a checkpoint.

	    Lookup order:
	      1. ``state["layer_weight"]``: its first dimension minus one;
	      2. an ``_l<N>d`` token in ``<grandparent dir name>_<file name>``;
	      3. an ``L<N>`` token in that same text;
	      4. the default of 4.
	    """
	    if "layer_weight" in state:
	        return int(state["layer_weight"].shape[0] - 1)

	    label = f"{path.parent.parent.name}_{path.name}"
	    # Patterns are tried in priority order; the first hit wins.
	    for pattern in (r"_l(\d+)d", r"L(\d+)"):
	        found = re.search(pattern, label)
	        if found is not None:
	            return int(found.group(1))
	    return 4


	def infer_mode(score_path: Path) -> str:
	    """Return the scoring mode encoded in a score file's name.

	    ``"dot"`` when the file name contains ``_dot_``, ``"neg_l2"`` when it
	    contains ``_neg_l2_``, and ``"cos"`` otherwise.  ``_dot_`` takes
	    precedence when both markers are present.
	    """
	    filename = score_path.name
	    for marker, mode in (("_dot_", "dot"), ("_neg_l2_", "neg_l2")):
	        if marker in filename:
	            return mode
	    return "cos"


	def score_cache_path(root: Path, split_seed: int, val_score_path: Path) -> Path:
	    """Map a validation score file to the path of its cached test-score file.

	    The result lives under
	    ``<root>/validation_runs/dynamic_seed<split_seed>/post95_test_scores/``,
	    keeps the sub-directory layout that ``val_score_path`` has relative to
	    the split directory, and has the first ``val_`` in the file name
	    replaced by ``test_``.

	    Raises:
	        ValueError: if ``val_score_path`` is not located inside the split
	            directory.
	    """
	    split_dir = root / "validation_runs" / f"dynamic_seed{split_seed}"
	    # Resolve both sides before comparing: the score path is made absolute,
	    # so a relative or symlinked ``root`` would otherwise never be a prefix
	    # of it and ``relative_to`` would raise.
	    rel = val_score_path.resolve().relative_to(split_dir.resolve())
	    name = rel.name.replace("val_", "test_", 1)
	    return split_dir / "post95_test_scores" / rel.parent / name


	def checkpoint_for_score(score_path: Path) -> Path:
	    """Locate the checkpoint file behind a single-model score file.

	    The score file stem (with its first ``val_`` removed) is split on
	    ``_``: the first token is the variant, the first ``s<digits>`` token is
	    the seed and the first ``d<digits>`` token is the dimension.  The
	    checkpoint is
	    ``<score dir>/../checkpoints/<variant>_val_<seed>_<dim>.pt``.

	    Raises:
	        ValueError: for ``*_ensemble_mean`` score files, or when the file
	            name has no seed or dimension token.
	    """
	    score_path = score_path.resolve()
	    stem = score_path.stem.replace("val_", "", 1)
	    if stem.endswith("_ensemble_mean"):
	        raise ValueError("ensemble scores do not map to a single checkpoint")
	    parts = stem.split("_")

	    def first_token(prefix: str) -> str:
	        # A bare ``next(...)`` would leak StopIteration on a malformed name;
	        # report which token is missing instead.
	        token = next((p for p in parts if p.startswith(prefix) and p[1:].isdigit()), None)
	        if token is None:
	            raise ValueError(f"no '{prefix}<digits>' token in score file name {score_path.name!r}")
	        return token

	    variant = parts[0]
	    seed = first_token("s")
	    dim = first_token("d")
	    return score_path.parent.parent / "checkpoints" / f"{variant}_val_{seed}_{dim}.pt"


	def ensemble_member_scores(score_path: Path) -> list[Path]:
	    """List the member score files that make up an ensemble score file.

	    Member stems are read from the comma-separated ``models=`` line of
	    ``ensemble_result.txt`` in the parent of the score directory; each stem
	    maps to ``<score dir>/<stem>.npy``.

	    Raises:
	        ValueError: if ``ensemble_result.txt`` contains no ``models=`` line.
	    """
	    score_path = score_path.resolve()
	    result_path = score_path.parent.parent / "ensemble_result.txt"
	    lines = result_path.read_text().splitlines()
	    # Supply a default so a missing line becomes a descriptive ValueError
	    # rather than a leaked StopIteration.
	    models_line = next((line for line in lines if line.startswith("models=")), None)
	    if models_line is None:
	        raise ValueError(f"no 'models=' line found in {result_path}")
	    stems = [s.strip() for s in models_line.split("=", 1)[1].split(",") if s.strip()]
	    return [score_path.parent / f"{stem}.npy" for stem in stems]


	@torch.no_grad()
	def score_checkpoint_on_test(
	    root: Path,
	    split_seed: int,
	    module,
	    parts,
	    data_cache: dict,
	    test_pairs: np.ndarray,
	    val_score_path: Path,
	    device: str,
	    batch_size: int,
	) -> np.ndarray:
	    """Score the test pairs with the checkpoint behind one validation score file.

	    If the cached test-score file for ``val_score_path`` already exists it
	    is loaded and returned.  Otherwise the matching checkpoint is loaded,
	    the model is rebuilt from it, ``module.predict_scores`` is run on
	    ``test_pairs``, and the float32 result is saved to the cache file and
	    returned.

	    ``data_cache`` is mutated: graph data built by ``module.build_data`` is
	    stored under the key ``(use_citation, use_coauthor)`` so runs sharing
	    the same flags reuse it.  Both flags are derived from the name of the
	    checkpoint's run directory.
	    """
	    cache_file = score_cache_path(root, split_seed, val_score_path)
	    if cache_file.exists():
	        return np.load(cache_file)
	    cache_file.parent.mkdir(parents=True, exist_ok=True)

	    checkpoint = checkpoint_for_score(val_score_path)
	    weights = torch.load(checkpoint, map_location=device)
	    embed_dim = weights["author_emb.weight"].shape[1]
	    num_layers = infer_layers(checkpoint, weights)

	    # Edge types are switched off by markers in the run directory name.
	    run_name = checkpoint.parent.parent.name
	    author_paper_only = "author_paper_only" in run_name
	    use_citation = not ("no_cite" in run_name or author_paper_only)
	    use_coauthor = not ("no_coauthor" in run_name or author_paper_only)

	    graph_key = (use_citation, use_coauthor)
	    if graph_key not in data_cache:
	        # 6611 and 79937 are hard-coded sizes; presumably the author and
	        # paper counts of the dataset -- TODO confirm.
	        data_cache[graph_key] = module.build_data(
	            parts,
	            6611,
	            79937,
	            torch.device(device),
	            use_citation=use_citation,
	            use_coauthor=use_coauthor,
	        )

	    if "learnw" in checkpoint.name:
	        model_cls = module.LearnableWeightLightGCN
	    else:
	        model_cls = module.LightGCN
	    feat_dim = parts["paper_feat_aug"].shape[1]
	    model = model_cls(6611, feat_dim, embed_dim, num_layers).to(torch.device(device))
	    model.load_state_dict(weights)

	    raw = module.predict_scores(
	        model,
	        data_cache[graph_key],
	        test_pairs,
	        batch_size,
	        mode=infer_mode(val_score_path),
	        normalize_embeddings=False,
	    )
	    scores = raw.astype(np.float32)
	    np.save(cache_file, scores)

	    # Drop the model before moving on so its memory can be reclaimed.
	    del model
	    if torch.cuda.is_available():
	        torch.cuda.empty_cache()
	    print(f"saved {cache_file}")
	    return scores


	def score_val_path_on_test(
	    root: Path,
	    split_seed: int,
	    module,
	    parts,
	    data_cache: dict,
	    test_pairs: np.ndarray,
	    val_score_path: Path,
	    device: str,
	    batch_size: int,
	) -> np.ndarray:
	    """Return test scores for a validation score file, single model or ensemble.

	    A cached test-score file is returned directly when present.  For a file
	    whose name ends in ``_ensemble_mean.npy`` every member is scored
	    recursively and the element-wise mean (float32) is cached and returned.
	    Any other file is delegated to ``score_checkpoint_on_test``.
	    """
	    cache_file = score_cache_path(root, split_seed, val_score_path)
	    if cache_file.exists():
	        return np.load(cache_file)

	    if not val_score_path.name.endswith("_ensemble_mean.npy"):
	        return score_checkpoint_on_test(
	            root, split_seed, module, parts, data_cache, test_pairs, val_score_path, device, batch_size
	        )

	    member_scores = []
	    for member_path in ensemble_member_scores(val_score_path):
	        member_scores.append(
	            score_val_path_on_test(
	                root, split_seed, module, parts, data_cache, test_pairs, member_path, device, batch_size
	            )
	        )
	    cache_file.parent.mkdir(parents=True, exist_ok=True)
	    averaged = np.mean(member_scores, axis=0).astype(np.float32)
	    np.save(cache_file, averaged)
	    print(f"saved {cache_file}")
	    return averaged


	def select_variant_val_scores(post95, root: Path, split_seed: int, y: np.ndarray, max_cols: int) -> list[Path]:
	    """Pick the best validation score files to use as extra stacking columns.

	    Candidate files are searched under
	    ``<root>/validation_runs/dynamic_seed<split_seed>/``.  A candidate is
	    skipped when its path (relative to that directory) contains one of the
	    excluded tags, when its length differs from ``y``, or when its scores
	    are (near-)constant.  The remaining files are ranked, descending, by
	    the first value returned by ``post95.best_f1(y, scores)`` and the top
	    ``max_cols`` paths are returned.

	    Args:
	        post95: object exposing ``best_f1(y, scores)`` returning a 5-tuple.
	        root: package root directory.
	        split_seed: seed naming the split directory.
	        y: validation labels.
	        max_cols: maximum number of paths to return.
	    """
	    split_dir = root / "validation_runs" / f"dynamic_seed{split_seed}"
	    excluded_tags = ("hgt", "sage", "bce", "norm", "hinge")
	    # Wildcards are required here: a literal pattern can match at most one
	    # file, which would make the ranking below pointless.  Run directories
	    # are presumably named ``dyn<suffix>`` -- confirm against the training
	    # scripts.
	    candidates = sorted(split_dir.glob("dyn*/scores/val_*.npy"))

	    ranked = []
	    for path in candidates:
	        # Match tags against the path *inside* the split directory only, so
	        # an unrelated parent directory (e.g. ``/home/sage/...``) cannot
	        # exclude every file.
	        rel_text = path.relative_to(split_dir).as_posix()
	        if any(tag in rel_text for tag in excluded_tags):
	            continue
	        scores = np.load(path).astype(np.float32)
	        if len(scores) != len(y) or np.std(scores) < 1e-8:
	            continue
	        f1 = post95.best_f1(y, scores)[0]
	        ranked.append((f1, path))
	    # Stable sort: ties keep the alphabetical order of ``candidates``.
	    ranked.sort(key=lambda row: row[0], reverse=True)
	    return [path for _, path in ranked[:max_cols]]


	def variant_feature_matrix(post95, raw_scores: list[np.ndarray]) -> np.ndarray:
	if not raw_scores:
	return np.zeros((0, 0), dtype=np.float32)
	cols = []
	for scores in raw_scores:
	cols.extend([post95.zscore(scores), post95.rank01(scores)])
	raw = np.vstack(raw_scores)
	cols.extend([post95.zscore(raw.mean(axis=0)), post95.zscore(raw.std(axis=0)), post95.rank01(raw.mean(axis=0))])
	return np.column_stack(cols).astype(np.float32)


	def topk_content_similarity_fast(root: Path, pairs: np.ndarray, builder) -> np.ndarray:
	    """Compute content-similarity features between candidates and author history.

	    For every ``(author, paper)`` row of ``pairs`` the candidate paper's
	    L2-normalised feature vector is compared (dot product) with those of
	    the papers in ``builder.author_papers[author]``.  The three output
	    columns are:

	      0. the maximum similarity,
	      1. the mean of the top 3 similarities,
	      2. the mean of the top 5 similarities

	    (fewer when the author has fewer papers).  Rows whose author has no
	    papers stay zero.

	    Results are cached as ``.npy`` files under
	    ``<root>/validation_runs/feature_cache``.

	    Args:
	        root: package root; features are read from
	            ``<root>/data_and_docs/feature.pkl``.
	        pairs: integer array with author ids in column 0 and paper ids in
	            column 1.
	        builder: object exposing ``author_papers``, indexable by author id.

	    Returns:
	        float32 array of shape ``(len(pairs), 3)``.
	    """
	    import hashlib

	    cache = root / "validation_runs" / "feature_cache"
	    cache.mkdir(parents=True, exist_ok=True)
	    # Key the cache on the exact pair contents *and order*.  A key built
	    # only from the row count and column sums is shared by any permutation
	    # of the same pairs, which would return cached rows in the wrong order.
	    pair_bytes = np.ascontiguousarray(pairs, dtype=np.int64).tobytes()
	    digest = hashlib.sha1(pair_bytes).hexdigest()[:16]
	    path = cache / f"topk_content_{len(pairs)}_{digest}.npy"
	    # NOTE(review): the key does not capture ``builder``; the same pairs
	    # scored against a different history would reuse this file -- confirm
	    # that callers never do that.
	    if path.exists():
	        return np.load(path)

	    # NOTE: unpickling executes arbitrary code; only use trusted data files.
	    with (root / "data_and_docs" / "feature.pkl").open("rb") as f:
	        # ``.numpy()`` implies the pickle holds a tensor-like object.
	        feat = pkl.load(f).numpy().astype(np.float32)
	    # Normalise rows so the dot products below are cosine similarities; the
	    # epsilon guards all-zero rows.
	    feat /= np.linalg.norm(feat, axis=1, keepdims=True) + 1e-8

	    out = np.zeros((len(pairs), 3), dtype=np.float32)
	    # Group rows by author with a stable sort so each author's history is
	    # looked up once.
	    order = np.argsort(pairs[:, 0], kind="mergesort")
	    authors = pairs[order, 0]
	    boundaries = np.r_[0, np.flatnonzero(authors[1:] != authors[:-1]) + 1, len(order)]
	    for lo, hi in zip(boundaries[:-1], boundaries[1:]):
	        idx = order[lo:hi]
	        author = int(pairs[idx[0], 0])
	        hist = np.asarray(list(builder.author_papers[author]), dtype=np.int64)
	        if len(hist) == 0:
	            continue
	        cand = pairs[idx, 1].astype(np.int64)
	        sims = feat[cand] @ feat[hist].T
	        out[idx, 0] = sims.max(axis=1)
	        for col, k in [(1, 3), (2, 5)]:
	            kk = min(k, sims.shape[1])
	            # The last ``kk`` entries after partitioning are the largest.
	            top = np.partition(sims, -kk, axis=1)[:, -kk:]
	            out[idx, col] = top.mean(axis=1)
	    np.save(path, out)
	    return out


	def make_submissions(root: Path, out_dir: Path, pred_score: np.ndarray, ratios: list[float]) -> None:
	    """Write one thresholded submission CSV per requested positive ratio.

	    For each ratio the ``round(len(pred_score) * ratio)`` highest-scoring
	    rows are labelled 1, then every row flagged in
	    ``<root>/cached_scores/test_known_mask.npy`` is forced to 1.  The
	    result is written to ``<out_dir>/submission_post95_ens_r<ratio>.csv``
	    with columns ``Index`` and ``Predicted``, and a summary line is
	    printed.

	    Args:
	        root: package root containing ``cached_scores/test_known_mask.npy``.
	        out_dir: existing directory that receives the CSV files.
	        pred_score: one score per test row; higher means more positive.
	        ratios: target fractions of rows to label positive before the known
	            mask is applied.
	    """
	    known = np.load(root / "cached_scores" / "test_known_mask.npy").astype(bool)
	    n_total = len(pred_score)
	    # The ranking does not depend on the ratio, so sort once.
	    ranking = np.argsort(pred_score)
	    for ratio in ratios:
	        n_pos = min(max(int(round(n_total * ratio)), 0), n_total)
	        pred = np.zeros(n_total, dtype=np.int8)
	        # Slice from an explicit start index: ``ranking[-n_pos:]`` with
	        # ``n_pos == 0`` would select every row instead of none.
	        pred[ranking[n_total - n_pos:]] = 1
	        pred[known] = 1
	        sub = pd.DataFrame({"Index": np.arange(len(pred), dtype=np.int64), "Predicted": pred})
	        path = out_dir / f"submission_post95_ens_r{ratio:.3f}.csv"
	        sub.to_csv(path, index=False)
	        print(f"{path} positives={int(pred.sum())} ratio={pred.mean():.6f}")


	def main() -> None:
	    """Fit the stacker on the validation split and write test submissions.

	    Steps:
	      1. Parse the CLI options and load the helper modules from
	         ``<root>/code``.
	      2. Build the validation feature matrix: rank features of the main
	         score, explicit graph features, negative-evidence features, top-k
	         content similarity, plus columns derived from the selected
	         variant score files.
	      3. Fit a LightGBM classifier on that matrix.
	      4. Build the same feature matrix for the test pairs, scoring
	         checkpoints on the test set (or reusing cached scores) as needed.
	      5. Save the predicted probabilities and one submission CSV per ratio
	         under ``<root>/validation_runs/dynamic_seed<seed>/post95_submission``.

	    Raises:
	        ValueError: if the main validation score file does not hold exactly
	            one score per validation pair.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument("--package-root", type=Path, default=Path(__file__).resolve().parents[1])
	    parser.add_argument("--split-seed", type=int, default=202)
	    parser.add_argument("--main-val-score-file", type=Path, required=True)
	    parser.add_argument("--device", default="cuda:0" if torch.cuda.is_available() else "cpu")
	    parser.add_argument("--batch-size", type=int, default=131072)
	    parser.add_argument("--max-variant-cols", type=int, default=20)
	    parser.add_argument("--seed", type=int, default=202)
	    parser.add_argument("--ratios", nargs="*", type=float, default=[0.498, 0.500, 0.502, 0.504, 0.505])
	    args = parser.parse_args()

	    # Resolve a user-supplied root as well: the score paths are resolved, and
	    # the cache helpers compare them against paths built from ``root``.
	    root = args.package_root.resolve()
	    args.main_val_score_file = args.main_val_score_file.resolve()
	    stack_mod = load_module("stack_rank_calibration", root / "code" / "stack_rank_calibration.py")
	    lgcn_mod = load_module("train_val_lgcn_ensemble", root / "code" / "train_val_lgcn_ensemble.py")
	    post95 = load_module("post95_ablation", root / "code" / "post95_ablation.py")

	    out_dir = root / "validation_runs" / f"dynamic_seed{args.split_seed}" / "post95_submission"
	    out_dir.mkdir(parents=True, exist_ok=True)

	    train_refs, val_pairs = lgcn_mod.make_notebook_style_split(root, args.split_seed, 0.9)
	    val_pairs_arr = val_pairs[["source", "target"]].to_numpy(np.int64)
	    y = val_pairs["label"].to_numpy(np.int8)
	    main_val_score = np.load(args.main_val_score_file).astype(np.float32)
	    # Fail early with a clear message; a mismatch would otherwise surface as
	    # a shape error deep inside the feature stacking below.
	    if len(main_val_score) != len(y):
	        raise ValueError(
	            f"{args.main_val_score_file} has {len(main_val_score)} scores but the validation split has {len(y)} pairs"
	        )

	    print("building validation features")
	    val_builder = stack_mod.ExplicitGraphFeatures(root, train_refs)
	    X_val_hand = val_builder.transform(val_pairs_arr)
	    X_val = np.column_stack(
	        [
	            stack_mod.add_rank_features(val_pairs_arr, main_val_score),
	            X_val_hand,
	            post95.negative_evidence_features(X_val_hand, main_val_score),
	            topk_content_similarity_fast(root, val_pairs_arr, val_builder),
	        ]
	    ).astype(np.float32)

	    selected_paths = select_variant_val_scores(post95, root, args.split_seed, y, args.max_variant_cols)
	    (out_dir / "selected_variant_val_scores.txt").write_text("\n".join(str(p) for p in selected_paths) + "\n")
	    X_val_var = variant_feature_matrix(post95, [np.load(p).astype(np.float32) for p in selected_paths])
	    # With no selected variants the helper returns a (0, 0) array, which
	    # cannot be column-stacked onto the main matrix; skip it in that case.
	    if X_val_var.size:
	        X_val = np.column_stack([X_val, X_val_var]).astype(np.float32)
	    print(f"validation matrix {X_val.shape}")

	    clf = lgb.LGBMClassifier(
	        n_estimators=1200,
	        learning_rate=0.025,
	        num_leaves=31,
	        subsample=0.9,
	        colsample_bytree=0.9,
	        reg_lambda=5.0,
	        min_child_samples=80,
	        objective="binary",
	        verbose=-1,
	        random_state=args.seed,
	    )
	    clf.fit(X_val, y)

	    print("building test features")
	    test_pairs = np.array(read_txt(root / "data_and_docs" / "bipartite_test_ann.txt"), dtype=np.int64)
	    parts = lgcn_mod.build_parts(root, None, 79937, split_seed=args.split_seed, train_frac=0.9)
	    data_cache = {}
	    main_test_score = score_val_path_on_test(
	        root,
	        args.split_seed,
	        lgcn_mod,
	        parts,
	        data_cache,
	        test_pairs,
	        args.main_val_score_file,
	        args.device,
	        args.batch_size,
	    )
	    # Test-time graph features are built from the full training edge list,
	    # whereas the validation features above use only the split's train part.
	    full_refs = pd.DataFrame(read_txt(root / "data_and_docs" / "bipartite_train_ann.txt"), columns=["source", "target"])
	    test_builder = stack_mod.ExplicitGraphFeatures(root, full_refs)
	    X_test_hand = test_builder.transform(test_pairs)
	    X_test = np.column_stack(
	        [
	            stack_mod.add_rank_features(test_pairs, main_test_score),
	            X_test_hand,
	            post95.negative_evidence_features(X_test_hand, main_test_score),
	            topk_content_similarity_fast(root, test_pairs, test_builder),
	        ]
	    ).astype(np.float32)

	    test_variant_scores = [
	        score_val_path_on_test(root, args.split_seed, lgcn_mod, parts, data_cache, test_pairs, p, args.device, args.batch_size)
	        for p in selected_paths
	    ]
	    X_test_var = variant_feature_matrix(post95, test_variant_scores)
	    # Mirror the validation branch so both matrices keep the same columns.
	    if X_test_var.size:
	        X_test = np.column_stack([X_test, X_test_var]).astype(np.float32)
	    print(f"test matrix {X_test.shape}")

	    pred_score = clf.predict_proba(X_test)[:, 1].astype(np.float32)
	    np.save(out_dir / "test_post95_ens_pred.npy", pred_score)
	    make_submissions(root, out_dir, pred_score, args.ratios)


	# Run the pipeline only when this file is executed as a script, not when
	# it is imported as a module.
	if __name__ == "__main__":
	    main()