| """Generate full-test dot-score ensemble submissions from saved full checkpoints.""" |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import importlib.util |
| import pickle as pkl |
| from pathlib import Path |
|
|
| import numpy as np |
| import pandas as pd |
| import torch |
| import torch.nn as nn |
| from torch_geometric.data import HeteroData |
|
|
|
|
# Relations visited (in this order) by every LightGCNLayer, written as
# (source node type, relation name, destination node type) triples.
EDGE_TYPES = [
    # author -> paper reference edges taken from the training pairs
    ("author", "ref", "paper"),
    # the same reference edges with source and destination swapped
    ("paper", "beref", "author"),
    # paper -> paper citation edges (stored in both directions by build())
    ("paper", "cite", "paper"),
    # author -> author co-authorship edges (stored in both directions by build())
    ("author", "coauthor", "author"),
]
|
|
|
|
def read_txt(path: Path) -> list[list[int]]:
    """Parse a text file of whitespace-separated integers.

    Args:
        path: File to read. Each non-blank line holds integers separated by
            whitespace.

    Returns:
        One ``list[int]`` per non-blank line, in file order.

    Raises:
        ValueError: If a token cannot be converted to ``int``.
    """
    # The context manager closes the handle deterministically instead of
    # leaving that to the garbage collector.
    with path.open() as handle:
        # Blank lines (typically a trailing newline at EOF) are skipped so
        # they do not become empty rows that break fixed-width consumers.
        return [
            [int(token) for token in line.split()]
            for line in handle
            if line.strip()
        ]
|
|
|
|
def log_norm(x):
    """Apply ``log1p`` to *x* and standardize the result.

    Args:
        x: NumPy array of values accepted by ``np.log1p``.

    Returns:
        ``log1p(x)`` shifted by its mean and divided by its standard
        deviation plus ``1e-8``.
    """
    logged = np.log1p(x)
    centered = logged - logged.mean()
    # The epsilon keeps the division finite when every value is identical.
    scale = logged.std() + 1e-8
    return centered / scale
|
|
|
|
class LightGCNLayer(nn.Module):
    """Parameter-free propagation step that mean-aggregates neighbor features."""

    def forward(self, x_dict, edge_index_dict):
        """Propagate node features once along every known relation.

        Args:
            x_dict: Mapping from node type to its feature tensor.
            edge_index_dict: Mapping from edge-type triple to a pair of
                (source indices, destination indices).

        Returns:
            Mapping from node type to its updated features. For each relation
            in ``EDGE_TYPES`` present in ``edge_index_dict`` the destination
            nodes receive the mean of their source neighbors; the per-relation
            results are then averaged. Node types that receive no relation
            keep their input features.
        """
        incoming = {node_type: [] for node_type in x_dict}

        for edge_type in EDGE_TYPES:
            if edge_type not in edge_index_dict:
                continue
            src_type, _, dst_type = edge_type
            src_idx, dst_idx = edge_index_dict[edge_type]
            src_feat = x_dict[src_type]
            n_dst = x_dict[dst_type].size(0)

            # Sum of neighbor features landing on each destination node.
            summed = src_feat.new_zeros((n_dst, src_feat.size(-1)))
            summed.index_add_(0, dst_idx, src_feat[src_idx])

            # In-degree of each destination node under this relation.
            counts = src_feat.new_zeros((n_dst, 1))
            unit = torch.ones(
                (dst_idx.numel(), 1), dtype=src_feat.dtype, device=src_feat.device
            )
            counts.index_add_(0, dst_idx, unit)

            # Clamp so isolated nodes divide by one and stay at zero.
            incoming[dst_type].append(summed / counts.clamp(min=1.0))

        updated = {}
        for node_type, messages in incoming.items():
            if messages:
                updated[node_type] = sum(messages) / len(messages)
            else:
                updated[node_type] = x_dict[node_type]
        return updated
|
|
|
|
class LightGCN(nn.Module):
    """LightGCN-style encoder over the author/paper heterogeneous graph.

    Authors get a learned embedding table; papers get a linear projection of
    their input features. Both are propagated through a stack of
    ``LightGCNLayer`` modules.
    """

    def __init__(self, n_author, feat_dim, dim, layers=4):
        """Create the embedding table, the feature projection and the layers.

        Args:
            n_author: Number of rows in the author embedding table.
            feat_dim: Width of the paper input features.
            dim: Width of the shared embedding space.
            layers: Number of propagation layers.
        """
        super().__init__()
        self.author_emb = nn.Embedding(n_author, dim)
        self.paper_proj = nn.Linear(feat_dim, dim)
        propagation = [LightGCNLayer() for _ in range(layers)]
        self.layers = nn.ModuleList(propagation)
        self.num_layers = layers

    def encode(self, data):
        """Return per-node-type embeddings averaged over all propagation depths.

        Args:
            data: Graph object providing ``data["paper"].x`` and
                ``data.edge_index_dict``.

        Returns:
            Mapping from node type to the uniform average of the input
            embeddings and the output of every layer.
        """
        current = {
            "author": self.author_emb.weight,
            "paper": self.paper_proj(data["paper"].x),
        }
        snapshots = [current]
        for layer in self.layers:
            current = layer(current, data.edge_index_dict)
            snapshots.append(current)

        # Every depth (including depth 0) contributes with equal weight.
        weight = 1.0 / len(snapshots)
        pooled = {}
        for node_type in current:
            total = 0
            for snapshot in snapshots:
                total = total + weight * snapshot[node_type]
            pooled[node_type] = total
        return pooled
|
|
|
|
def build(root: Path, device, n_paper: int = 79937, n_author: int = 6611):
    """Load the raw dataset files and assemble the heterogeneous graph.

    Reads ``bipartite_train_ann.txt``, ``bipartite_test_ann.txt``,
    ``paper_file_ann.txt``, ``author_file_ann.txt`` and ``feature.pkl`` from
    ``root / "data_and_docs"``.

    Args:
        root: Package root that contains the ``data_and_docs`` directory.
        device: Device the assembled graph is moved to.
        n_paper: Number of paper nodes. ``feature.pkl`` must provide exactly
            this many rows, because the degree features are concatenated to it.
        n_author: Number of author nodes.

    Returns:
        A tuple ``(data, test_pairs, refs, feat_dim)`` where ``data`` is the
        ``HeteroData`` graph on ``device``, ``test_pairs`` is the test file as
        an ``int64`` array, ``refs`` is the training pair list as returned by
        ``read_txt`` and ``feat_dim`` is the width of the paper feature matrix.

    Raises:
        IndexError: If a paper id in the edge files is outside ``n_paper``.
    """
    data_dir = root / "data_and_docs"
    refs = read_txt(data_dir / "bipartite_train_ann.txt")
    test = read_txt(data_dir / "bipartite_test_ann.txt")
    cite = read_txt(data_dir / "paper_file_ann.txt")
    coa = read_txt(data_dir / "author_file_ann.txt")

    # SECURITY: unpickling executes arbitrary code embedded in the file; only
    # load feature.pkl from a trusted source.
    with (data_dir / "feature.pkl").open("rb") as f:
        # The pickled object is expected to expose ``.numpy()``.
        feat = pkl.load(f).numpy().astype(np.float32)

    # Convert each edge list once; the arrays feed both the degree counts and
    # the edge-index tensors below.
    ref_arr = np.array(refs, dtype=np.int64)
    cite_arr = np.array(cite, dtype=np.int64)
    coa_arr = np.array(coa, dtype=np.int64)

    # Per-paper degree counts. ``np.add.at`` accumulates correctly for
    # repeated indices, matching an element-by-element ``+= 1`` loop.
    ref_deg = np.zeros(n_paper, np.float32)
    cout = np.zeros(n_paper, np.float32)
    cin = np.zeros(n_paper, np.float32)
    np.add.at(ref_deg, ref_arr[:, 1], 1)
    np.add.at(cout, cite_arr[:, 0], 1)
    np.add.at(cin, cite_arr[:, 1], 1)

    deg = np.stack([log_norm(ref_deg), log_norm(cout), log_norm(cin)], axis=-1)
    paper_x = np.concatenate([feat, deg], axis=1)
    # Column-wise standardization of the combined feature matrix.
    paper_x = (paper_x - paper_x.mean(0)) / (paper_x.std(0) + 1e-8)

    rt = torch.as_tensor(ref_arr, dtype=torch.long)
    ct = torch.as_tensor(cite_arr, dtype=torch.long)
    co = torch.as_tensor(coa_arr, dtype=torch.long)

    data = HeteroData()
    data["author"].num_nodes = n_author
    data["paper"].num_nodes = n_paper
    data["paper"].x = torch.as_tensor(paper_x, dtype=torch.float)
    data["author", "ref", "paper"].edge_index = rt.t().contiguous()
    # Reverse relation: swap the two columns before transposing.
    data["paper", "beref", "author"].edge_index = rt[:, [1, 0]].t().contiguous()
    # Citation and co-author edges are stored in both directions.
    data["paper", "cite", "paper"].edge_index = (
        torch.cat([ct, ct[:, [1, 0]]], 0).t().contiguous()
    )
    data["author", "coauthor", "author"].edge_index = (
        torch.cat([co, co[:, [1, 0]]], 0).t().contiguous()
    )
    return data.to(device), np.array(test, dtype=np.int64), refs, paper_x.shape[1]
|
|
|
|
@torch.no_grad()
def predict_dot(model, data, pairs, batch_size):
    """Score (author, paper) pairs by the dot product of their embeddings.

    Args:
        model: Object whose ``encode(data)`` returns a mapping with
            ``"author"`` and ``"paper"`` embedding tensors.
        data: Graph passed through to ``model.encode``.
        pairs: Array-like of ``(author_index, paper_index)`` rows.
        batch_size: Number of pairs scored per chunk; must be positive.

    Returns:
        A 1-D ``float32`` array with one score per row of ``pairs``.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    pairs = np.asarray(pairs)
    # np.concatenate rejects an empty list, so answer the empty case directly.
    if len(pairs) == 0:
        return np.zeros(0, dtype=np.float32)

    z = model.encode(data)
    author_emb = z["author"].detach().cpu().numpy()
    paper_emb = z["paper"].detach().cpu().numpy()

    # Chunking bounds the size of the temporary gathered embedding matrices.
    chunks = []
    for start in range(0, len(pairs), batch_size):
        batch = pairs[start : start + batch_size]
        dots = np.sum(author_emb[batch[:, 0]] * paper_emb[batch[:, 1]], axis=1)
        chunks.append(dots.astype(np.float32))
    return np.concatenate(chunks)
|
|
|
|
def rank01(x):
    """Map values to evenly spaced ranks in ``[0, 1]``.

    Args:
        x: 1-D array of scores.

    Returns:
        A ``float32`` array of the same length in which the smallest value
        receives 0 and the largest receives 1. Ties are broken by original
        position because the sort is stable.
    """
    n = len(x)
    ascending = np.argsort(x, kind="mergesort")
    # Invert the sorting permutation: positions[i] is the rank of element i.
    positions = np.empty(n, dtype=np.intp)
    positions[ascending] = np.arange(n)
    grid = np.linspace(0, 1, n, dtype=np.float32)
    return grid[positions]
|
|
|
|
def write(scores, known, out_dir, prefix, ratios):
    """Write one binary submission CSV per positive ratio.

    Args:
        scores: 1-D float array with one score per test row.
        known: Boolean mask (same length as ``scores``) of rows that must be
            ranked first; their score is replaced by ``+inf``.
        out_dir: Existing directory that receives the CSV files.
        prefix: File-name prefix; files are named ``{prefix}_r{ratio:.3f}.csv``.
        ratios: Iterable of fractions of rows to label positive.

    Each file has the columns ``Index`` and ``Predicted``. The path, the
    number of positives and the positive rate are printed for every file.
    """
    out_dir = Path(out_dir)
    n = len(scores)

    forced = scores.copy()
    forced[known] = np.inf
    order = np.argsort(forced)[::-1]
    index = np.arange(n)

    for ratio in ratios:
        # Clamp so a negative ratio cannot turn into a negative slice bound,
        # which would silently select almost every row.
        k = min(max(int(round(n * ratio)), 0), n)
        pred = np.zeros(n, dtype=np.int8)
        pred[order[:k]] = 1
        df = pd.DataFrame({"Index": index, "Predicted": pred.astype(str)})
        path = out_dir / f"{prefix}_r{ratio:.3f}.csv"
        df.to_csv(path, index=False)
        print(path, int(pred.sum()), float(pred.mean()))
|
|
|
|
def main():
    """Score the test pairs with every checkpoint and write ensemble submissions.

    Steps:
        1. Build the graph and load the test pairs.
        2. For each checkpoint, load its cached dot-product scores from
           ``cached_scores/dot_full`` or compute and cache them.
        3. Average the first four score vectors twice (z-score mean and
           rank mean) and write one CSV per ratio into
           ``submissions/dot_full``.

    All listed checkpoints are scored and cached, but only the first four
    contribute to the written ensembles.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--package-root", type=Path, default=Path(__file__).resolve().parents[1])
    parser.add_argument("--device", default="cuda:0" if torch.cuda.is_available() else "cpu")
    parser.add_argument("--batch-size", type=int, default=65536)
    parser.add_argument("--ratios", nargs="*", type=float, default=[0.505, 0.515, 0.521, 0.530, 0.540])
    args = parser.parse_args()

    root = args.package_root
    device = torch.device(args.device)
    data, pairs, refs, feat_dim = build(root, device)

    # Test pairs that already appear in the training edges are forced positive.
    train_set = set(map(tuple, refs))
    # An explicit bool dtype keeps the mask usable even when there are no
    # pairs (np.array([]) would otherwise default to float64).
    known = np.array([tuple(x) in train_set for x in pairs.tolist()], dtype=bool)

    ckpts = [
        "model_best_s0_d512.pt",
        "model_best_s0_d384.pt",
        "model_lgcn_s23.pt",
        "model_lgcn_s0.pt",
        "model_lgcn_s100.pt",
        "model_lgcn_s77.pt",
        "model_lgcn_s42.pt",
        "model_lgcn_s2024.pt",
    ]
    n_author = 6611
    n_layers = 4

    score_dir = root / "cached_scores" / "dot_full"
    out_dir = root / "submissions" / "dot_full"
    score_dir.mkdir(parents=True, exist_ok=True)
    out_dir.mkdir(parents=True, exist_ok=True)

    scores = []
    for name in ckpts:
        path = root / "checkpoints" / "extra_models" / name
        cache = score_dir / f"{path.stem}_dot.npy"

        s = None
        if cache.exists():
            cached = np.load(cache)
            # A cache produced for a different test file would silently
            # misalign with `pairs`; only accept it when the length matches.
            if cached.shape == (len(pairs),):
                s = cached
            else:
                print(f"{cache}: cached shape {cached.shape} does not match {len(pairs)} pairs; recomputing")

        if s is None:
            # NOTE(review): torch.load unpickles the file; only load
            # checkpoints from a trusted source.
            state = torch.load(path, map_location=device)
            # The embedding width is read from the checkpoint itself.
            dim = state["author_emb.weight"].shape[1]
            model = LightGCN(n_author, feat_dim, dim, n_layers).to(device)
            model.load_state_dict(state)
            s = predict_dot(model, data, pairs, args.batch_size)
            np.save(cache, s)
            del model
            if device.type == "cuda":
                torch.cuda.empty_cache()

        print(name, s.mean(), s.std())
        scores.append(s)

    top4 = scores[:4]
    zmean = np.mean([(s - s.mean()) / (s.std() + 1e-8) for s in top4], axis=0)
    rmean = np.mean([rank01(s) for s in top4], axis=0)
    write(zmean, known, out_dir, "sub_dot_top4_z", args.ratios)
    write(rmean, known, out_dir, "sub_dot_top4_rank", args.ratios)


if __name__ == "__main__":
    main()
|
|