cs3319-project2 / code /generate_large_ensemble_submission.py

CS3319 Project 2 final deliverable (public F1 = 0.96626)

f28d994 12 days ago

12.5 kB

	"""Generate larger ensemble submission candidates.

	This script extends the confirmed 6-model LightGCN ensemble by using every
	compatible checkpoint in `checkpoints/extra_models/`, and optionally blends
	rank-normalized cached BPR / LightGBM scores.

	Run from the package root:

	python code/generate_large_ensemble_submission.py
	"""

	from __future__ import annotations

	import argparse
	import pickle as pkl
	from pathlib import Path

	import numpy as np
	import pandas as pd
	import torch
	import torch.nn as nn
	from numpy.linalg import norm
	from torch_geometric.data import HeteroData


	# Relations of the heterogeneous graph, each written as
	# (source node type, relation name, destination node type).
	# LightGCNLayer.forward iterates this list in order and skips any relation
	# that is missing from the edge_index_dict it is given.
	EDGE_TYPES = [
	("author", "ref", "paper"),
	("paper", "beref", "author"),
	("paper", "cite", "paper"),
	("author", "coauthor", "author"),
	]


	def read_txt(path: Path) -> list[list[int]]:
	    """Parse a text file of whitespace-separated integers into rows.

	    Args:
	        path: File to read; each non-blank line holds integers separated by
	            whitespace.

	    Returns:
	        One list of ints per non-blank line, in file order.

	    Raises:
	        ValueError: If a token cannot be parsed as an integer.
	    """
	    rows: list[list[int]] = []
	    with path.open("r", encoding="utf-8") as f:
	        for line in f:
	            tokens = line.split()
	            # Blank lines (typically a trailing newline at end of file) would
	            # otherwise become empty rows that break two-column consumers.
	            if not tokens:
	                continue
	            rows.append([int(token) for token in tokens])
	    return rows


	def log_norm(x: np.ndarray) -> np.ndarray:
	    """Apply ``log1p`` to ``x`` and standardize the result.

	    The transformed values are shifted by their mean and divided by their
	    standard deviation; ``1e-8`` is added to the divisor so a constant input
	    does not divide by zero.
	    """
	    logged = np.log1p(x)
	    centered = logged - logged.mean()
	    spread = logged.std() + 1e-8
	    return centered / spread


	class LightGCNLayer(nn.Module):
	    """Parameter-free message-passing layer over the relations in EDGE_TYPES."""

	    def forward(self, x_dict, edge_index_dict):
	        """Propagate node features one hop along every available relation.

	        For each relation present in ``edge_index_dict`` the source features
	        are summed per destination node and divided by the number of incoming
	        edges (clamped to at least 1, so isolated destinations get zeros).
	        A node type's output is the average of the per-relation results
	        targeting it; node types targeted by no relation keep their input.

	        Args:
	            x_dict: Mapping from node type to its feature tensor.
	            edge_index_dict: Mapping from relation triple to a two-row index
	                tensor (source indices, destination indices).

	        Returns:
	            Mapping with the same node types as ``x_dict``.
	        """
	        incoming = {}
	        for node_type in x_dict:
	            incoming[node_type] = []

	        for relation in EDGE_TYPES:
	            if relation in edge_index_dict:
	                src_type, _, dst_type = relation
	                src, dst = edge_index_dict[relation]
	                src_x = x_dict[src_type]
	                num_dst = x_dict[dst_type].size(0)

	                # Sum of neighbour features and neighbour count per destination.
	                summed = src_x.new_zeros((num_dst, src_x.size(-1)))
	                counts = src_x.new_zeros((num_dst, 1))
	                summed.index_add_(0, dst, src_x[src])
	                counts.index_add_(0, dst, src_x.new_ones((dst.numel(), 1)))

	                incoming[dst_type].append(summed / torch.clamp(counts, min=1.0))

	        merged = {}
	        for node_type, messages in incoming.items():
	            if messages:
	                merged[node_type] = sum(messages) / len(messages)
	            else:
	                merged[node_type] = x_dict[node_type]
	        return merged


	class LightGCN(nn.Module):
	    """Heterogeneous LightGCN encoder for author and paper nodes.

	    Authors get a learned embedding table; papers are projected from their
	    input features by a linear layer. Both are then propagated through a
	    stack of ``LightGCNLayer`` modules.
	    """

	    def __init__(self, num_authors: int, paper_feat_dim: int, embed_dim: int, num_layers: int = 4):
	        """Create the embedding table, the paper projection and the layer stack."""
	        super().__init__()
	        self.author_emb = nn.Embedding(num_authors, embed_dim)
	        self.paper_proj = nn.Linear(paper_feat_dim, embed_dim)
	        self.layers = nn.ModuleList([LightGCNLayer() for _ in range(num_layers)])
	        self.num_layers = num_layers

	    def encode(self, data):
	        """Return per-node-type embeddings averaged over all propagation depths.

	        The initial representations and the output of every layer each
	        contribute with weight ``1 / (num_layers + 1)``.

	        Args:
	            data: Graph object providing ``data["paper"].x`` and
	                ``data.edge_index_dict``.

	        Returns:
	            Mapping from node type to its combined embedding tensor.
	        """
	        current = {
	            "author": self.author_emb.weight,
	            "paper": self.paper_proj(data["paper"].x),
	        }
	        snapshots = [current]
	        for propagate in self.layers:
	            current = propagate(current, data.edge_index_dict)
	            snapshots.append(current)

	        share = 1.0 / (self.num_layers + 1)
	        combined = {}
	        for node_type in current:
	            total = 0
	            for snapshot in snapshots:
	                total = total + share * snapshot[node_type]
	            combined[node_type] = total
	        return combined


	def cos_sim(a: np.ndarray, b: np.ndarray, eps: float = 1e-12) -> np.ndarray:
	    """Return the row-wise cosine similarity between ``a`` and ``b``.

	    ``eps`` is added to the product of the row norms so all-zero rows yield
	    0 instead of a division by zero.
	    """
	    dots = (a * b).sum(axis=1)
	    scale = norm(a, axis=1) * norm(b, axis=1) + eps
	    return dots / scale


	def build_features(data_dir: Path, num_papers: int):
	    """Load the raw edge lists and paper features and build model inputs.

	    Reads four edge-list text files and ``feature.pkl`` from ``data_dir``,
	    derives three per-paper degree counts, appends their log-normalized
	    values to the paper features and standardizes every resulting column.

	    Args:
	        data_dir: Directory holding the ``*_ann.txt`` files and ``feature.pkl``.
	        num_papers: Length of the per-paper degree arrays.

	    Returns:
	        Dict with keys ``citation``, ``coauthor`` and ``ref_edges``
	        (DataFrames with ``source``/``target`` columns), ``existing_refs`` and
	        ``refs_to_pred`` (raw row lists) and ``paper_feat_aug`` (standardized
	        float feature matrix).
	    """
	    cite_rows = read_txt(data_dir / "paper_file_ann.txt")
	    train_rows = read_txt(data_dir / "bipartite_train_ann.txt")
	    test_rows = read_txt(data_dir / "bipartite_test_ann.txt")
	    coauthor_rows = read_txt(data_dir / "author_file_ann.txt")

	    # NOTE: unpickling can execute arbitrary code; feature.pkl must come from
	    # a trusted source.
	    with (data_dir / "feature.pkl").open("rb") as f:
	        paper_feature = pkl.load(f)

	    # Three float32 counters (rows of one buffer), one slot per paper.
	    ref_deg, cite_out, cite_in = np.zeros((3, num_papers), dtype=np.float32)
	    for _, paper in train_rows:
	        ref_deg[paper] += 1
	    for source, target in cite_rows:
	        cite_out[source] += 1
	        cite_in[target] += 1

	    base = paper_feature.numpy().astype(np.float32)
	    degree_cols = np.stack(
	        [log_norm(ref_deg), log_norm(cite_out), log_norm(cite_in)],
	        axis=-1,
	    )
	    augmented = np.concatenate([base, degree_cols], axis=-1)

	    # Column-wise standardization; the epsilon guards constant columns.
	    col_mean = augmented.mean(axis=0)
	    col_std = augmented.std(axis=0) + 1e-8
	    augmented = (augmented - col_mean) / col_std

	    edge_columns = ["source", "target"]
	    return {
	        "citation": pd.DataFrame(cite_rows, columns=edge_columns),
	        "existing_refs": train_rows,
	        "refs_to_pred": test_rows,
	        "coauthor": pd.DataFrame(coauthor_rows, columns=edge_columns),
	        "paper_feat_aug": augmented,
	        "ref_edges": pd.DataFrame(train_rows, columns=edge_columns),
	    }


	def build_data(parts, num_authors: int, num_papers: int, device: torch.device):
	    """Assemble the heterogeneous graph used by the encoder.

	    Author-paper reference edges are stored in both directions under the
	    ``ref`` / ``beref`` relations; citation and co-author edges are made
	    symmetric by appending the reversed pairs.

	    Args:
	        parts: Output of ``build_features``; the ``ref_edges``, ``citation``,
	            ``coauthor`` and ``paper_feat_aug`` entries are read.
	        num_authors: Number of author nodes.
	        num_papers: Number of paper nodes.
	        device: Device the finished graph is moved to.

	    Returns:
	        The populated ``HeteroData`` object on ``device``.
	    """

	    def edge_pairs(key):
	        # (num_edges, 2) long tensor taken from the source/target columns.
	        table = parts[key][["source", "target"]]
	        return torch.as_tensor(table.to_numpy(), dtype=torch.long)

	    def both_directions(pairs):
	        # Original pairs followed by their reversals, as a (2, 2E) index.
	        doubled = torch.cat([pairs, pairs[:, [1, 0]]], dim=0)
	        return doubled.t().contiguous()

	    ref_pairs = edge_pairs("ref_edges")
	    cite_pairs = edge_pairs("citation")
	    coauthor_pairs = edge_pairs("coauthor")

	    graph = HeteroData()
	    graph["author"].num_nodes = num_authors
	    graph["paper"].num_nodes = num_papers
	    graph["paper"].x = torch.as_tensor(parts["paper_feat_aug"], dtype=torch.float)
	    graph["author", "ref", "paper"].edge_index = ref_pairs.t().contiguous()
	    graph["paper", "beref", "author"].edge_index = ref_pairs[:, [1, 0]].t().contiguous()
	    graph["paper", "cite", "paper"].edge_index = both_directions(cite_pairs)
	    graph["author", "coauthor", "author"].edge_index = both_directions(coauthor_pairs)
	    return graph.to(device)


	@torch.no_grad()
	def predict(model: LightGCN, data, pairs: np.ndarray, batch_size: int) -> np.ndarray:
	    """Score (author, paper) pairs by cosine similarity of their embeddings.

	    Args:
	        model: Network providing ``eval()`` and ``encode(data)``; ``encode``
	            must return a dict with ``"author"`` and ``"paper"`` tensors.
	        data: Graph object forwarded to ``model.encode``.
	        pairs: Integer array whose column 0 indexes the author embeddings and
	            column 1 the paper embeddings.
	        batch_size: Number of pairs scored per chunk; must be positive.

	    Returns:
	        float32 array with one score per row of ``pairs`` (empty when
	        ``pairs`` is empty).

	    Raises:
	        ValueError: If ``batch_size`` is not positive.
	    """
	    if batch_size <= 0:
	        raise ValueError(f"batch_size must be positive, got {batch_size}")

	    model.eval()
	    if len(pairs) == 0:
	        # np.concatenate rejects an empty list of chunks, so answer directly
	        # and skip the (expensive) graph encoding.
	        return np.empty(0, dtype=np.float32)

	    z_dict = model.encode(data)
	    author_z = z_dict["author"].cpu().numpy()
	    paper_z = z_dict["paper"].cpu().numpy()

	    # Chunked scoring keeps the gathered embedding copies bounded in size.
	    scores = []
	    for start in range(0, len(pairs), batch_size):
	        batch = pairs[start:start + batch_size]
	        chunk = cos_sim(author_z[batch[:, 0]], paper_z[batch[:, 1]])
	        scores.append(chunk.astype(np.float32))
	    return np.concatenate(scores)


	def checkpoint_weight(path: Path) -> float:
	    """Return the ensemble weight for a checkpoint, chosen by file name.

	    Six explicitly listed checkpoint names get weight 1.0. Otherwise names
	    starting with ``model_lgcn_s`` get 0.8, names starting with
	    ``model_best_`` get 0.6, and everything else gets 0.5.
	    """
	    filename = path.name

	    full_weight_names = (
	        "model_lgcn_s0.pt",
	        "model_lgcn_s42.pt",
	        "model_lgcn_s2024.pt",
	        "model_lgcn_s10.pt",
	        "model_lgcn_s100.pt",
	        "model_lgcn_dim384_s99.pt",
	    )
	    if filename in full_weight_names:
	        return 1.0

	    # Checked in order; the first matching prefix decides.
	    prefix_weights = (
	        ("model_lgcn_s", 0.8),
	        ("model_best_", 0.6),
	    )
	    for prefix, weight in prefix_weights:
	        if filename.startswith(prefix):
	            return weight
	    return 0.5


	def percent_rank(x: np.ndarray) -> np.ndarray:
	    """Map each value of a 1-D array to its rank, scaled evenly onto [0, 1].

	    The smallest value gets 0.0 and the largest 1.0. The sort is stable, so
	    tied values receive distinct ranks in order of their position.

	    Returns:
	        float32 array of the same length as ``x``.
	    """
	    count = len(x)
	    ascending = np.argsort(x, kind="mergesort")

	    # Invert the permutation: slot[i] is the sorted position of x[i].
	    slot = np.empty_like(ascending)
	    slot[ascending] = np.arange(count)

	    grid = np.linspace(0.0, 1.0, num=count, dtype=np.float32)
	    return grid[slot]


	def write_threshold_submissions(
	    scores: np.ndarray,
	    known_mask: np.ndarray,
	    output_dir: Path,
	    prefix: str,
	    thresholds: list[float],
	) -> None:
	    """Write one submission CSV per threshold.

	    A pair is predicted positive when its score is at least the threshold.
	    Pairs flagged in ``known_mask`` are always predicted positive.

	    Args:
	        scores: Floating-point score per test pair (not modified).
	        known_mask: Boolean mask of pairs to force positive.
	        output_dir: Existing directory the CSV files are written to.
	        prefix: File name prefix; files are named ``{prefix}_t{threshold:.2f}.csv``.
	        thresholds: Decision thresholds, one output file each.
	    """
	    forced = scores.copy()
	    # +inf (as in write_top_ratio_submissions) keeps known positives above
	    # any finite threshold; a fixed 1.0 would lose them for thresholds > 1.
	    forced[known_mask] = np.inf
	    index = np.arange(len(forced))
	    for threshold in thresholds:
	        preds = (forced >= threshold).astype(np.int8)
	        # Column-wise construction avoids building one Python list per row.
	        out = pd.DataFrame({"Index": index, "Predicted": preds})
	        path = output_dir / f"{prefix}_t{threshold:.2f}.csv"
	        out.to_csv(path, index=False)
	        print(f"{path}: positives={int(preds.sum())} ratio={preds.mean():.6f}")


	def write_top_ratio_submissions(
	    scores: np.ndarray,
	    known_mask: np.ndarray,
	    output_dir: Path,
	    prefix: str,
	    ratios: list[float],
	) -> None:
	    """Write one submission CSV per ratio, marking the top-scored share positive.

	    For each ratio the ``round(len(scores) * ratio)`` highest-scoring pairs
	    are predicted positive. Pairs flagged in ``known_mask`` are ranked first;
	    pairs with a NaN score are ranked last.

	    Args:
	        scores: Floating-point score per test pair (not modified).
	        known_mask: Boolean mask of pairs to rank ahead of everything else.
	        output_dir: Existing directory the CSV files are written to.
	        prefix: File name prefix; files are named ``{prefix}_r{ratio:.3f}.csv``.
	        ratios: Fractions of pairs to predict positive; the resulting count
	            is clamped to ``[0, len(scores)]``.
	    """
	    forced = scores.copy()
	    # argsort places NaN at the end, so after the reversal below NaN scores
	    # would lead the ranking; push them to the bottom instead.
	    forced[np.isnan(forced)] = -np.inf
	    forced[known_mask] = np.inf
	    order = np.argsort(forced)[::-1]

	    total = len(scores)
	    index = np.arange(total)
	    for ratio in ratios:
	        # Clamp: a negative k would turn order[:k] into "all but the last |k|".
	        k = min(max(int(round(total * ratio)), 0), total)
	        preds = np.zeros(total, dtype=np.int8)
	        preds[order[:k]] = 1
	        # Column-wise construction avoids building one Python list per row.
	        out = pd.DataFrame({"Index": index, "Predicted": preds})
	        path = output_dir / f"{prefix}_r{ratio:.3f}.csv"
	        out.to_csv(path, index=False)
	        print(f"{path}: positives={int(preds.sum())} ratio={preds.mean():.6f}")


	def main() -> None:
	    """Score every checkpoint on the test pairs and write ensemble submissions.

	    Steps:
	      1. Build features and the graph from ``<package-root>/data_and_docs``.
	      2. For each ``*.pt`` file in ``checkpoints/extra_models`` load its cached
	         scores from ``cached_scores/large_ensemble`` or compute and cache them.
	      3. Save the plain and weighted score means and write thresholded
	         submissions for both.
	      4. If all cached BPR / LightGBM score files exist, save the rank blend
	         and write top-ratio submissions for it; otherwise skip this step.

	    Raises:
	        FileNotFoundError: If no ``*.pt`` checkpoint is found.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument("--package-root", type=Path, default=Path(__file__).resolve().parents[1])
	    parser.add_argument("--device", default="cuda:0" if torch.cuda.is_available() else "cpu")
	    parser.add_argument("--batch-size", type=int, default=65536)
	    parser.add_argument("--recompute", action="store_true")
	    parser.add_argument(
	        "--thresholds",
	        nargs="*",
	        type=float,
	        default=[0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.40],
	    )
	    parser.add_argument(
	        "--ratios",
	        nargs="*",
	        type=float,
	        default=[0.505, 0.515, 0.521, 0.530, 0.540],
	    )
	    args = parser.parse_args()

	    root = args.package_root
	    data_dir = root / "data_and_docs"
	    checkpoint_dir = root / "checkpoints" / "extra_models"
	    score_dir = root / "cached_scores" / "large_ensemble"
	    output_dir = root / "submissions" / "large_ensemble"

	    # Fail early with a clear message; otherwise np.vstack([]) below would
	    # raise only after the expensive graph construction.
	    checkpoints = sorted(checkpoint_dir.glob("*.pt"))
	    if not checkpoints:
	        raise FileNotFoundError(f"no *.pt checkpoints found in {checkpoint_dir}")

	    score_dir.mkdir(parents=True, exist_ok=True)
	    output_dir.mkdir(parents=True, exist_ok=True)

	    device = torch.device(args.device)
	    # Hard-coded node counts handed to the feature and graph builders.
	    num_authors = 6611
	    num_papers = 79937

	    parts = build_features(data_dir, num_papers)
	    data = build_data(parts, num_authors, num_papers, device)
	    test_arr = np.array(parts["refs_to_pred"], dtype=np.int64)
	    # Test pairs that already appear in the training edges are known positives.
	    train_set = set(map(tuple, parts["existing_refs"]))
	    known_mask = np.array(
	        [tuple(pair) in train_set for pair in parts["refs_to_pred"]],
	        dtype=bool,  # stays a valid boolean mask even when there are no test pairs
	    )
	    print(f"known positives: {known_mask.sum()} / {len(known_mask)}")

	    paper_feat_dim = parts["paper_feat_aug"].shape[1]
	    expected_shape = (len(test_arr),)
	    model_scores = []
	    weights = []
	    for path in checkpoints:
	        weight = checkpoint_weight(path)
	        cache_path = score_dir / f"{path.stem}.npy"

	        scores = None
	        if cache_path.exists() and not args.recompute:
	            cached = np.load(cache_path).astype(np.float32)
	            if cached.shape == expected_shape:
	                scores = cached
	                print(f"{path.name}: loaded cached scores")
	            else:
	                # A cache written for a different test set must not be mixed in.
	                print(
	                    f"{path.name}: cached scores have shape {cached.shape}, "
	                    f"expected {expected_shape}; recomputing"
	                )

	        if scores is None:
	            # NOTE: torch.load unpickles the file; only use trusted checkpoints.
	            state = torch.load(path, map_location=device)
	            # The embedding width is read from the checkpoint itself.
	            embed_dim = state["author_emb.weight"].shape[1]
	            model = LightGCN(num_authors, paper_feat_dim, embed_dim).to(device)
	            model.load_state_dict(state)
	            scores = predict(model, data, test_arr, args.batch_size)
	            np.save(cache_path, scores)
	            print(f"{path.name}: computed scores")
	            del model
	            if device.type == "cuda":
	                torch.cuda.empty_cache()

	        print(f" mean={scores.mean():.6f} std={scores.std():.6f} weight={weight:.2f}")
	        model_scores.append(scores)
	        weights.append(weight)

	    score_stack = np.vstack(model_scores).astype(np.float32)
	    weights_np = np.array(weights, dtype=np.float32)
	    lgcn14_mean = score_stack.mean(axis=0)
	    lgcn14_weighted = np.average(score_stack, axis=0, weights=weights_np).astype(np.float32)

	    np.save(score_dir / "lgcn14_mean.npy", lgcn14_mean)
	    np.save(score_dir / "lgcn14_weighted.npy", lgcn14_weighted)

	    write_threshold_submissions(lgcn14_mean, known_mask, output_dir, "sub_lgcn14_mean", args.thresholds)
	    write_threshold_submissions(
	        lgcn14_weighted, known_mask, output_dir, "sub_lgcn14_weighted", args.thresholds
	    )

	    cached_dir = root / "cached_scores"
	    # (blend weight, cached score file), applied in this order on top of the
	    # 0.74 share given to the weighted LightGCN ensemble.
	    blend_components = [
	        (0.10, "test_bpr_cos.npy"),
	        (0.06, "test_bpr_dot.npy"),
	        (0.05, "test_lgb_scores.npy"),
	        (0.05, "test_lgb_v2_scores.npy"),
	    ]
	    missing = [name for _, name in blend_components if not (cached_dir / name).exists()]
	    if missing:
	        # The blend is an optional extra; the LightGCN submissions above are
	        # already written, so report and stop instead of crashing.
	        print(f"skipping rank blend; missing cached scores: {', '.join(missing)}")
	        return

	    rank_blend = 0.74 * percent_rank(lgcn14_weighted)
	    for share, name in blend_components:
	        component = np.load(cached_dir / name).astype(np.float32)
	        rank_blend += share * percent_rank(component)
	    rank_blend = rank_blend.astype(np.float32)
	    np.save(score_dir / "rank_blend_lgcn14_bpr_lgb.npy", rank_blend)
	    write_top_ratio_submissions(rank_blend, known_mask, output_dir, "sub_rankblend_lgcn14_bpr_lgb", args.ratios)


	if __name__ == "__main__":
	main()