"""Generate larger ensemble submission candidates.

This script extends the confirmed 6-model LightGCN ensemble by using every
compatible checkpoint in `checkpoints/extra_models/`, and optionally blends
rank-normalized cached BPR / LightGBM scores.

Run from the package root:

    python code/generate_large_ensemble_submission.py
"""
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import pickle as pkl |
| from pathlib import Path |
|
|
| import numpy as np |
| import pandas as pd |
| import torch |
| import torch.nn as nn |
| from numpy.linalg import norm |
| from torch_geometric.data import HeteroData |
|
|
|
|
# Relations visited during message passing, each written as a
# (source node type, relation name, destination node type) triple.
EDGE_TYPES: list[tuple[str, str, str]] = [
    ("author", "ref", "paper"),
    ("paper", "beref", "author"),
    ("paper", "cite", "paper"),
    ("author", "coauthor", "author"),
]
|
|
|
|
def read_txt(path: Path) -> list[list[int]]:
    """Parse a text file of whitespace-separated integers into rows.

    Args:
        path: File to read. Each non-blank line holds one row of integers.

    Returns:
        One list of ints per non-blank line, in file order.

    Raises:
        ValueError: If a token cannot be parsed as an integer.
    """
    rows: list[list[int]] = []
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            tokens = line.split()
            # Blank lines (for example a stray newline at the end of the file)
            # would otherwise become empty rows, which break callers that
            # unpack each row or build fixed-width DataFrames from the result.
            if tokens:
                rows.append([int(token) for token in tokens])
    return rows
|
|
|
|
def log_norm(x: np.ndarray) -> np.ndarray:
    """Return ``log1p(x)`` standardized over all of its elements.

    The mean and standard deviation are taken over the whole array; a small
    constant is added to the standard deviation so a constant input does not
    divide by zero. The input array is not modified.
    """
    logged = np.log1p(x)
    centered = logged - logged.mean()
    spread = logged.std() + 1e-8
    return centered / spread
|
|
|
|
class LightGCNLayer(nn.Module):
    """Parameter-free propagation step over the relations in ``EDGE_TYPES``.

    For each relation present in ``edge_index_dict``, every destination node
    receives the mean of the source embeddings on its incoming edges (a node
    with no incoming edge for that relation receives zeros). The output for a
    node type is the unweighted mean of those per-relation results; a node
    type that is not the destination of any processed relation keeps its
    input embedding unchanged.
    """

    def forward(self, x_dict, edge_index_dict):
        """Propagate embeddings one hop.

        Args:
            x_dict: Mapping from node type to its embedding matrix.
            edge_index_dict: Mapping from relation triple to an edge index
                that unpacks into (source indices, destination indices).

        Returns:
            A new mapping from node type to its propagated embeddings.
        """
        received = {node_type: [] for node_type in x_dict}

        for relation in EDGE_TYPES:
            if relation not in edge_index_dict:
                continue
            src_type, _, dst_type = relation
            src_idx, dst_idx = edge_index_dict[relation]
            src_emb = x_dict[src_type]
            num_dst = x_dict[dst_type].size(0)

            # Sum the source embeddings arriving at each destination node.
            summed = src_emb.new_zeros((num_dst, src_emb.size(-1)))
            summed.index_add_(0, dst_idx, src_emb[src_idx])

            # Count incoming edges per destination node.
            counts = src_emb.new_zeros((num_dst, 1))
            ones = torch.ones(
                (dst_idx.numel(), 1), dtype=src_emb.dtype, device=src_emb.device
            )
            counts.index_add_(0, dst_idx, ones)

            # Clamp so isolated nodes divide by one instead of zero.
            received[dst_type].append(summed / counts.clamp(min=1.0))

        out = {}
        for node_type, messages in received.items():
            if messages:
                out[node_type] = sum(messages) / len(messages)
            else:
                out[node_type] = x_dict[node_type]
        return out
|
|
|
|
class LightGCN(nn.Module):
    """LightGCN-style encoder over an author/paper graph.

    Authors get a learned embedding table; papers are embedded by a linear
    projection of their feature vectors. The final embedding of each node is
    the uniform average of the input embedding and every layer's output.
    """

    def __init__(self, num_authors: int, paper_feat_dim: int, embed_dim: int, num_layers: int = 4):
        """Create the embedding table, the projection and the layer stack.

        Args:
            num_authors: Number of rows in the author embedding table.
            paper_feat_dim: Width of the paper feature vectors.
            embed_dim: Shared embedding width for both node types.
            num_layers: Number of propagation layers.
        """
        super().__init__()
        self.author_emb = nn.Embedding(num_authors, embed_dim)
        self.paper_proj = nn.Linear(paper_feat_dim, embed_dim)
        self.layers = nn.ModuleList(LightGCNLayer() for _ in range(num_layers))
        self.num_layers = num_layers

    def encode(self, data):
        """Return layer-averaged embeddings for every node type.

        Args:
            data: Graph object providing ``data["paper"].x`` and
                ``data.edge_index_dict``.

        Returns:
            Mapping from node type ("author", "paper") to its embeddings.
        """
        current = {
            "author": self.author_emb.weight,
            "paper": self.paper_proj(data["paper"].x),
        }
        snapshots = [current]
        for propagate in self.layers:
            current = propagate(current, data.edge_index_dict)
            snapshots.append(current)

        # Every snapshot, including the input embeddings, gets equal weight.
        scale = 1.0 / (self.num_layers + 1)
        pooled = {}
        for node_type in current:
            total = 0
            for snapshot in snapshots:
                total = total + scale * snapshot[node_type]
            pooled[node_type] = total
        return pooled
|
|
|
|
def cos_sim(a: np.ndarray, b: np.ndarray, eps: float = 1e-12) -> np.ndarray:
    """Row-wise cosine similarity between two equally shaped 2-D arrays.

    ``eps`` is added to the product of the row norms so an all-zero row
    yields a similarity of 0 instead of a division by zero.
    """
    dots = (a * b).sum(axis=1)
    norm_product = norm(a, axis=1) * norm(b, axis=1)
    return dots / (norm_product + eps)
|
|
|
|
def build_features(data_dir: Path, num_papers: int):
    """Load the raw edge lists and build the augmented paper feature matrix.

    Three degree counts per paper (author references, outgoing citations,
    incoming citations) are log-normalized, appended to the pickled paper
    features, and the combined matrix is standardized column-wise.

    Args:
        data_dir: Directory holding the ``*_ann.txt`` edge lists and
            ``feature.pkl``.
        num_papers: Number of papers; sizes the degree arrays.

    Returns:
        Dict with keys ``citation``, ``coauthor`` and ``ref_edges``
        (DataFrames with ``source``/``target`` columns), ``existing_refs``
        and ``refs_to_pred`` (raw row lists), and ``paper_feat_aug`` (the
        standardized feature matrix).
    """
    citation = read_txt(data_dir / "paper_file_ann.txt")
    existing_refs = read_txt(data_dir / "bipartite_train_ann.txt")
    refs_to_pred = read_txt(data_dir / "bipartite_test_ann.txt")
    coauthor = read_txt(data_dir / "author_file_ann.txt")

    # NOTE(review): unpickling executes arbitrary code; only point this at a
    # trusted feature.pkl.
    with (data_dir / "feature.pkl").open("rb") as handle:
        paper_feature = pkl.load(handle)

    ref_deg = np.zeros(num_papers, dtype=np.float32)
    cite_out = np.zeros(num_papers, dtype=np.float32)
    cite_in = np.zeros(num_papers, dtype=np.float32)

    for _author, paper_id in existing_refs:
        ref_deg[paper_id] += 1
    for citing, cited in citation:
        cite_out[citing] += 1
        cite_in[cited] += 1

    # presumably paper_feature is a torch tensor (it exposes .numpy()) — TODO confirm
    base_feat = paper_feature.numpy().astype(np.float32)
    degree_columns = [log_norm(counts) for counts in (ref_deg, cite_out, cite_in)]
    degree_feat = np.stack(degree_columns, axis=-1)

    combined = np.concatenate([base_feat, degree_feat], axis=-1)
    column_mean = combined.mean(axis=0)
    column_std = combined.std(axis=0)
    combined = (combined - column_mean) / (column_std + 1e-8)

    edge_columns = ["source", "target"]
    parts = {}
    parts["citation"] = pd.DataFrame(citation, columns=edge_columns)
    parts["existing_refs"] = existing_refs
    parts["refs_to_pred"] = refs_to_pred
    parts["coauthor"] = pd.DataFrame(coauthor, columns=edge_columns)
    parts["paper_feat_aug"] = combined
    parts["ref_edges"] = pd.DataFrame(existing_refs, columns=edge_columns)
    return parts
|
|
|
|
def build_data(parts, num_authors: int, num_papers: int, device: torch.device):
    """Assemble the heterogeneous author/paper graph and move it to ``device``.

    Args:
        parts: Dict providing ``ref_edges``, ``citation`` and ``coauthor``
            DataFrames (``source``/``target`` columns) and the
            ``paper_feat_aug`` feature matrix.
        num_authors: Number of author nodes.
        num_papers: Number of paper nodes.
        device: Device the finished graph is moved to.

    Returns:
        A ``HeteroData`` graph with four relations: ``ref`` (author to
        paper), ``beref`` (the same edges reversed), and ``cite`` /
        ``coauthor`` with every edge present in both directions.
    """

    def edge_pairs(key):
        # (num_edges, 2) integer tensor of (source, target) rows.
        frame = parts[key]
        return torch.as_tensor(frame[["source", "target"]].to_numpy(), dtype=torch.long)

    def swapped(pairs):
        return pairs[:, [1, 0]]

    def as_edge_index(pairs):
        # Edge indices are stored as (2, num_edges).
        return pairs.t().contiguous()

    def both_directions(pairs):
        return torch.cat([pairs, swapped(pairs)], dim=0)

    ref_pairs = edge_pairs("ref_edges")
    cite_pairs = edge_pairs("citation")
    coauthor_pairs = edge_pairs("coauthor")

    graph = HeteroData()
    graph["author"].num_nodes = num_authors
    graph["paper"].num_nodes = num_papers
    graph["paper"].x = torch.as_tensor(parts["paper_feat_aug"], dtype=torch.float)
    graph["author", "ref", "paper"].edge_index = as_edge_index(ref_pairs)
    graph["paper", "beref", "author"].edge_index = as_edge_index(swapped(ref_pairs))
    graph["paper", "cite", "paper"].edge_index = as_edge_index(both_directions(cite_pairs))
    graph["author", "coauthor", "author"].edge_index = as_edge_index(
        both_directions(coauthor_pairs)
    )
    return graph.to(device)
|
|
|
|
@torch.no_grad()
def predict(model: LightGCN, data, pairs: np.ndarray, batch_size: int) -> np.ndarray:
    """Score (author, paper) pairs by cosine similarity of their embeddings.

    The model is switched to eval mode and the graph is encoded once; the
    pairs are then scored on the CPU in chunks of ``batch_size``.

    Args:
        model: Encoder whose ``encode`` returns "author" and "paper" embeddings.
        data: Graph passed to ``model.encode``.
        pairs: Integer array whose column 0 holds author indices and column 1
            paper indices.
        batch_size: Number of pairs scored per chunk.

    Returns:
        float32 array with one score per pair, in input order.
    """
    model.eval()
    embeddings = model.encode(data)
    author_vecs = embeddings["author"].cpu().numpy()
    paper_vecs = embeddings["paper"].cpu().numpy()

    chunk_scores = []
    for offset in range(0, len(pairs), batch_size):
        # Slicing clamps at the end of the array, so the last chunk may be short.
        chunk = pairs[offset:offset + batch_size]
        similarity = cos_sim(author_vecs[chunk[:, 0]], paper_vecs[chunk[:, 1]])
        chunk_scores.append(similarity.astype(np.float32))
    return np.concatenate(chunk_scores)
|
|
|
|
def checkpoint_weight(path: Path) -> float:
    """Return the ensemble weight for a checkpoint, chosen by its file name.

    Six explicitly listed checkpoints get 1.0; other ``model_lgcn_s*`` files
    get 0.8; ``model_best_*`` files get 0.6; anything else gets 0.5.
    """
    full_weight_names = (
        "model_lgcn_s0.pt",
        "model_lgcn_s42.pt",
        "model_lgcn_s2024.pt",
        "model_lgcn_s10.pt",
        "model_lgcn_s100.pt",
        "model_lgcn_dim384_s99.pt",
    )
    # Checked in order; the first matching prefix wins.
    prefix_weights = (
        ("model_lgcn_s", 0.8),
        ("model_best_", 0.6),
    )

    filename = path.name
    if filename in full_weight_names:
        return 1.0
    for prefix, weight in prefix_weights:
        if filename.startswith(prefix):
            return weight
    return 0.5
|
|
|
|
def percent_rank(x: np.ndarray) -> np.ndarray:
    """Map each value to its rank, rescaled evenly onto [0, 1] as float32.

    The smallest value maps to 0.0 and the largest to 1.0. The sort is
    stable, so tied values get distinct, increasing ranks in their original
    order rather than a shared rank.
    """
    ascending = np.argsort(x, kind="mergesort")
    levels = np.linspace(0.0, 1.0, num=len(x), dtype=np.float32)
    result = np.empty(ascending.shape, dtype=np.float32)
    # The i-th smallest element receives the i-th level.
    result[ascending] = levels
    return result
|
|
|
|
def write_threshold_submissions(
    scores: np.ndarray,
    known_mask: np.ndarray,
    output_dir: Path,
    prefix: str,
    thresholds: list[float],
) -> None:
    """Write one submission CSV per threshold.

    A pair is predicted positive when its score is at least the threshold.
    Pairs selected by ``known_mask`` are always predicted positive, whatever
    the threshold and the score scale. ``scores`` is not modified.

    Args:
        scores: One score per test pair.
        known_mask: Selector of pairs already known to be positive.
        output_dir: Existing directory the CSV files are written to.
        prefix: File-name prefix; files are named ``{prefix}_t{threshold:.2f}.csv``,
            so thresholds that agree to two decimals overwrite each other.
        thresholds: Score cut-offs to evaluate.
    """
    row_index = np.arange(len(scores))
    for threshold in thresholds:
        positive = scores >= threshold
        # Force known positives on the prediction itself instead of
        # overwriting their score with 1.0, which silently failed for any
        # threshold above 1.0.
        positive[known_mask] = True
        preds = positive.astype(np.int8)
        out = pd.DataFrame({"Index": row_index, "Predicted": preds})
        path = output_dir / f"{prefix}_t{threshold:.2f}.csv"
        out.to_csv(path, index=False)
        print(f"{path}: positives={int(preds.sum())} ratio={preds.mean():.6f}")
|
|
|
|
def write_top_ratio_submissions(
    scores: np.ndarray,
    known_mask: np.ndarray,
    output_dir: Path,
    prefix: str,
    ratios: list[float],
) -> None:
    """Write one submission CSV per target positive ratio.

    For each ratio the ``round(len(scores) * ratio)`` highest-scoring pairs
    are predicted positive. Pairs selected by ``known_mask`` rank above every
    other pair and are always predicted positive, even when that exceeds the
    requested count. NaN scores rank lowest. ``scores`` is not modified.

    Args:
        scores: One score per test pair.
        known_mask: Selector of pairs already known to be positive.
        output_dir: Existing directory the CSV files are written to.
        prefix: File-name prefix; files are named ``{prefix}_r{ratio:.3f}.csv``.
        ratios: Fractions of pairs to predict positive. The resulting count
            is clamped to ``[0, len(scores)]``.
    """
    total = len(scores)
    forced = scores.copy()
    if not np.issubdtype(forced.dtype, np.floating):
        # Integer scores cannot hold the +/-inf sentinels used below.
        forced = forced.astype(np.float64)
    # argsort places NaN last, so after the reversal below NaN scores would
    # rank first; push them to the bottom instead.
    forced[np.isnan(forced)] = -np.inf
    forced[known_mask] = np.inf
    order = np.argsort(forced)[::-1]

    row_index = np.arange(total)
    for ratio in ratios:
        # Clamp: a negative count would make order[:k] select from the wrong end.
        k = min(max(int(round(total * ratio)), 0), total)
        preds = np.zeros(total, dtype=np.int8)
        preds[order[:k]] = 1
        # Known positives stay positive even when k is below their count.
        preds[known_mask] = 1
        out = pd.DataFrame({"Index": row_index, "Predicted": preds})
        path = output_dir / f"{prefix}_r{ratio:.3f}.csv"
        out.to_csv(path, index=False)
        print(f"{path}: positives={int(preds.sum())} ratio={preds.mean():.6f}")
|
|
|
|
def main() -> None:
    """Score every checkpoint, ensemble the scores and write submission CSVs.

    Steps:
      1. Build the graph and the test pairs from ``data_and_docs``.
      2. Score the test pairs with each ``*.pt`` checkpoint in
         ``checkpoints/extra_models`` (scores are cached per checkpoint as
         ``.npy`` files and reused unless ``--recompute`` is given or the
         cache has the wrong length).
      3. Write threshold submissions for the plain and the weighted mean.
      4. If all cached BPR / LightGBM score files are present, write
         top-ratio submissions for a rank-normalized blend; otherwise skip
         the blend with a message.

    Raises:
        FileNotFoundError: If no checkpoint is found.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--package-root", type=Path, default=Path(__file__).resolve().parents[1])
    parser.add_argument("--device", default="cuda:0" if torch.cuda.is_available() else "cpu")
    parser.add_argument("--batch-size", type=int, default=65536)
    parser.add_argument("--recompute", action="store_true")
    parser.add_argument(
        "--thresholds",
        nargs="*",
        type=float,
        default=[0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.40],
    )
    parser.add_argument(
        "--ratios",
        nargs="*",
        type=float,
        default=[0.505, 0.515, 0.521, 0.530, 0.540],
    )
    args = parser.parse_args()

    root = args.package_root
    data_dir = root / "data_and_docs"
    checkpoint_dir = root / "checkpoints" / "extra_models"
    score_dir = root / "cached_scores" / "large_ensemble"
    output_dir = root / "submissions" / "large_ensemble"
    score_dir.mkdir(parents=True, exist_ok=True)
    output_dir.mkdir(parents=True, exist_ok=True)

    device = torch.device(args.device)
    # Hard-coded node counts; they must match the data files and the
    # embedding tables stored in the checkpoints.
    num_authors = 6611
    num_papers = 79937

    parts = build_features(data_dir, num_papers)
    data = build_data(parts, num_authors, num_papers, device)
    test_arr = np.array(parts["refs_to_pred"], dtype=np.int64)
    # Test pairs that already appear in the training edges are known positives.
    train_set = set(map(tuple, parts["existing_refs"]))
    known_mask = np.array(
        [tuple(pair) in train_set for pair in parts["refs_to_pred"]], dtype=bool
    )
    print(f"known positives: {known_mask.sum()} / {len(known_mask)}")

    checkpoints = sorted(checkpoint_dir.glob("*.pt"))
    if not checkpoints:
        # Without this guard the failure surfaces later as an opaque
        # np.vstack error on an empty list.
        raise FileNotFoundError(f"no *.pt checkpoints found in {checkpoint_dir}")

    expected_shape = (len(test_arr),)
    model_scores = []
    weights = []
    for path in checkpoints:
        cache_path = score_dir / f"{path.stem}.npy"
        weight = checkpoint_weight(path)

        scores = None
        if cache_path.exists() and not args.recompute:
            cached = np.load(cache_path).astype(np.float32)
            if cached.shape == expected_shape:
                scores = cached
                print(f"{path.name}: loaded cached scores")
            else:
                # A cache written for a different test set cannot be stacked
                # with the other models' scores; recompute it.
                print(
                    f"{path.name}: cached scores have shape {cached.shape}, "
                    f"expected {expected_shape}; recomputing"
                )

        if scores is None:
            # NOTE(review): torch.load unpickles the file; only load trusted
            # checkpoints.
            state = torch.load(path, map_location=device)
            # The embedding width is read from the checkpoint so models of
            # different sizes can be mixed.
            embed_dim = state["author_emb.weight"].shape[1]
            model = LightGCN(num_authors, parts["paper_feat_aug"].shape[1], embed_dim).to(device)
            model.load_state_dict(state)
            scores = predict(model, data, test_arr, args.batch_size)
            np.save(cache_path, scores)
            print(f"{path.name}: computed scores")
            del model
            if device.type == "cuda":
                torch.cuda.empty_cache()

        print(f"  mean={scores.mean():.6f} std={scores.std():.6f} weight={weight:.2f}")
        model_scores.append(scores)
        weights.append(weight)

    score_stack = np.vstack(model_scores).astype(np.float32)
    weights_np = np.array(weights, dtype=np.float32)
    lgcn14_mean = score_stack.mean(axis=0)
    lgcn14_weighted = np.average(score_stack, axis=0, weights=weights_np).astype(np.float32)

    np.save(score_dir / "lgcn14_mean.npy", lgcn14_mean)
    np.save(score_dir / "lgcn14_weighted.npy", lgcn14_weighted)

    write_threshold_submissions(lgcn14_mean, known_mask, output_dir, "sub_lgcn14_mean", args.thresholds)
    write_threshold_submissions(
        lgcn14_weighted, known_mask, output_dir, "sub_lgcn14_weighted", args.thresholds
    )

    # Optional rank blend with cached scores from other model families.
    cached_dir = root / "cached_scores"
    component_files = {
        "bpr_cos": "test_bpr_cos.npy",
        "bpr_dot": "test_bpr_dot.npy",
        "lgb": "test_lgb_scores.npy",
        "lgb_v2": "test_lgb_v2_scores.npy",
    }
    missing = [
        file_name
        for file_name in component_files.values()
        if not (cached_dir / file_name).exists()
    ]
    if missing:
        # The blend is optional: the LightGCN submissions above are already
        # written, so report and stop instead of crashing.
        print(f"skipping rank blend; missing cached scores: {', '.join(missing)}")
        return

    cached_components = {
        key: np.load(cached_dir / file_name).astype(np.float32)
        for key, file_name in component_files.items()
    }
    # Blend weights sum to 1.0 together with the 0.74 LightGCN share.
    component_weights = {"bpr_cos": 0.10, "bpr_dot": 0.06, "lgb": 0.05, "lgb_v2": 0.05}
    rank_blend = 0.74 * percent_rank(lgcn14_weighted)
    for key, share in component_weights.items():
        rank_blend += share * percent_rank(cached_components[key])
    rank_blend = rank_blend.astype(np.float32)
    np.save(score_dir / "rank_blend_lgcn14_bpr_lgb.npy", rank_blend)
    write_top_ratio_submissions(rank_blend, known_mask, output_dir, "sub_rankblend_lgcn14_bpr_lgb", args.ratios)
|
|
|
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|