| """Ultimate version for best possible score. |
| |
| 1. LightGCN-style model trained longer on full data |
| 2. Also train V2-style (SAGEConv + BPR) models on full data |
| 3. Ensemble both architectures |
| 4. Multiple threshold options |
| """ |
| import os |
| import pickle as pkl |
| import random |
|
|
| import numpy as np |
| import pandas as pd |
| import torch |
| import torch.nn as nn |
| import torch.nn.functional as F |
|
|
| from torch_geometric.data import HeteroData |
| from torch_geometric.nn import SAGEConv, HeteroConv |
| from sklearn.metrics import precision_recall_curve, roc_auc_score |
| from numpy.linalg import norm |
|
|
| device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') |
| print('device:', device) |
|
|
|
|
| def set_seed(seed=0): |
| random.seed(seed) |
| np.random.seed(seed) |
| torch.manual_seed(seed) |
| if torch.cuda.is_available(): |
| torch.cuda.manual_seed_all(seed) |
|
|
|
|
| |
| base_path = "/home/lzc/cs3319-project" |
|
|
|
|
| def read_txt(file): |
| res_list = [] |
| with open(file, "r") as f: |
| for line in f: |
| res_list.append(list(map(int, line.strip().split()))) |
| return res_list |
|
|
|
|
| citation = read_txt(os.path.join(base_path, "paper_file_ann.txt")) |
| existing_refs = read_txt(os.path.join(base_path, "bipartite_train_ann.txt")) |
| refs_to_pred = read_txt(os.path.join(base_path, "bipartite_test_ann.txt")) |
| coauthor = read_txt(os.path.join(base_path, "author_file_ann.txt")) |
|
|
| with open(os.path.join(base_path, "feature.pkl"), 'rb') as f: |
| paper_feature = pkl.load(f) |
|
|
| train_set = set(map(tuple, existing_refs)) |
| overlap = train_set & set(map(tuple, refs_to_pred)) |
| print(f"Known positives: {len(overlap)}") |
|
|
| |
| cite_edges = pd.DataFrame(citation, columns=['source', 'target']) |
| ref_edges = pd.DataFrame(existing_refs, columns=['source', 'target']) |
| coauthor_edges = pd.DataFrame(coauthor, columns=['source', 'target']) |
|
|
| node_tmp = pd.concat([cite_edges['source'], cite_edges['target'], ref_edges['target']]) |
| node_papers = pd.DataFrame(index=pd.unique(node_tmp)) |
| node_tmp = pd.concat([ref_edges['source'], coauthor_edges['source'], coauthor_edges['target']]) |
| node_authors = pd.DataFrame(index=pd.unique(node_tmp)) |
| num_authors = len(node_authors) |
| num_papers = len(node_papers) |
| print(f"Nodes: {num_authors} authors, {num_papers} papers") |
|
|
| |
| author_ref_deg = np.zeros(num_authors, dtype=np.float32) |
| paper_ref_deg = np.zeros(num_papers, dtype=np.float32) |
| paper_cite_out = np.zeros(num_papers, dtype=np.float32) |
| paper_cite_in = np.zeros(num_papers, dtype=np.float32) |
| for s, t in existing_refs: |
| author_ref_deg[s] += 1 |
| paper_ref_deg[t] += 1 |
| for s, t in citation: |
| paper_cite_out[s] += 1 |
| paper_cite_in[t] += 1 |
|
|
|
|
| def log_norm(x): |
| x = np.log1p(x) |
| return (x - x.mean()) / (x.std() + 1e-8) |
|
|
|
|
| paper_feat_np = paper_feature.numpy().astype(np.float32) |
| paper_deg_feat = np.stack([log_norm(paper_ref_deg), log_norm(paper_cite_out), |
| log_norm(paper_cite_in)], axis=-1) |
| paper_feat_aug = np.concatenate([paper_feat_np, paper_deg_feat], axis=-1) |
| |
| paper_feat_aug = (paper_feat_aug - paper_feat_aug.mean(axis=0)) / (paper_feat_aug.std(axis=0) + 1e-8) |
|
|
| |
| popular_threshold = np.percentile(paper_ref_deg[paper_ref_deg > 0], 70) |
| popular_papers = np.where(paper_ref_deg >= popular_threshold)[0] |
| coauthor_map = {i: set() for i in range(num_authors)} |
| for s, t in coauthor: |
| coauthor_map[s].add(t) |
| coauthor_map[t].add(s) |
| author_papers = {i: set() for i in range(num_authors)} |
| for s, t in existing_refs: |
| author_papers[s].add(t) |
| coauthor_paper_pool = {} |
| for a in range(num_authors): |
| pool = set() |
| for c in coauthor_map[a]: |
| pool.update(author_papers[c]) |
| pool -= author_papers[a] |
| coauthor_paper_pool[a] = list(pool) if pool else list(range(num_papers)) |
|
|
| existing_ref_set = set(map(tuple, existing_refs)) |
|
|
|
|
| |
| def build_data(ref_edges_use): |
| ref_tensor = torch.as_tensor(ref_edges_use[['source', 'target']].to_numpy(), dtype=torch.long) |
| cite_tensor = torch.as_tensor(cite_edges[['source', 'target']].to_numpy(), dtype=torch.long) |
| coauthor_tensor = torch.as_tensor(coauthor_edges[['source', 'target']].to_numpy(), dtype=torch.long) |
|
|
| d = HeteroData() |
| d['author'].num_nodes = num_authors |
| d['paper'].num_nodes = num_papers |
| d['paper'].x = torch.as_tensor(paper_feat_aug, dtype=torch.float) |
| d['author', 'ref', 'paper'].edge_index = ref_tensor.t().contiguous() |
| d['paper', 'beref', 'author'].edge_index = ref_tensor[:, [1, 0]].t().contiguous() |
| d['paper', 'cite', 'paper'].edge_index = torch.cat([ |
| cite_tensor, cite_tensor[:, [1, 0]], |
| ], dim=0).t().contiguous() |
| d['author', 'coauthor', 'author'].edge_index = torch.cat([ |
| coauthor_tensor, coauthor_tensor[:, [1, 0]], |
| ], dim=0).t().contiguous() |
| return d.to(device) |
|
|
|
|
| def sample_hard_negatives(n_samples): |
| neg_list = [] |
|
|
| def add_random(target): |
| nonlocal neg_list |
| while len(neg_list) < target: |
| s = np.random.randint(0, num_authors) |
| d = np.random.randint(0, num_papers) |
| if (s, d) not in existing_ref_set: |
| neg_list.append((s, d)) |
|
|
| add_random(int(n_samples * 0.5)) |
| cnt = 0 |
| while len(neg_list) < int(n_samples * 0.75) and cnt < n_samples * 2: |
| cnt += 1 |
| s = np.random.randint(0, num_authors) |
| d = popular_papers[np.random.randint(0, len(popular_papers))] |
| if (s, d) not in existing_ref_set: |
| neg_list.append((s, d)) |
| cnt = 0 |
| while len(neg_list) < n_samples and cnt < n_samples * 3: |
| cnt += 1 |
| s = np.random.randint(0, num_authors) |
| pool = coauthor_paper_pool.get(s, []) |
| if pool: |
| d = pool[np.random.randint(0, len(pool))] |
| if (s, d) not in existing_ref_set: |
| neg_list.append((s, d)) |
| add_random(n_samples) |
| return torch.tensor(neg_list[:n_samples], dtype=torch.long, device=device).t().contiguous() |
|
|
|
|
| def cos_sim(a, b, eps=1e-12): |
| return np.sum(a * b, axis=1) / (norm(a, axis=1) * norm(b, axis=1) + eps) |
|
|
|
|
| |
| |
| |
|
|
| class LightGCNLayer(nn.Module): |
| def __init__(self): |
| super().__init__() |
| self.edge_types_used = [ |
| ('author', 'ref', 'paper'), ('paper', 'beref', 'author'), |
| ('paper', 'cite', 'paper'), ('author', 'coauthor', 'author'), |
| ] |
|
|
| def forward(self, x_dict, edge_index_dict): |
| out_dict = {} |
| agg_dict = {nt: [] for nt in x_dict} |
| for et in self.edge_types_used: |
| if et not in edge_index_dict: |
| continue |
| src_type, _, dst_type = et |
| src, dst = edge_index_dict[et] |
| src_x = x_dict[src_type] |
| agg = src_x.new_zeros((x_dict[dst_type].size(0), src_x.size(-1))) |
| deg = src_x.new_zeros((x_dict[dst_type].size(0), 1)) |
| agg.index_add_(0, dst, src_x[src]) |
| deg.index_add_(0, dst, torch.ones( |
| (dst.numel(), 1), dtype=src_x.dtype, device=src_x.device)) |
| agg = agg / deg.clamp(min=1.0) |
| agg_dict[dst_type].append(agg) |
| for nt in x_dict: |
| if agg_dict[nt]: |
| out_dict[nt] = sum(agg_dict[nt]) / len(agg_dict[nt]) |
| else: |
| out_dict[nt] = x_dict[nt] |
| return out_dict |
|
|
|
|
| class LightGCNRecommender(nn.Module): |
| def __init__(self, embed_dim=256, num_layers=4): |
| super().__init__() |
| self.author_emb = nn.Embedding(num_authors, embed_dim) |
| self.paper_proj = nn.Linear(paper_feat_aug.shape[1], embed_dim) |
| self.layers = nn.ModuleList([LightGCNLayer() for _ in range(num_layers)]) |
| self.num_layers = num_layers |
| self.reset_parameters() |
|
|
| def reset_parameters(self): |
| nn.init.xavier_uniform_(self.author_emb.weight) |
| nn.init.xavier_uniform_(self.paper_proj.weight) |
| nn.init.zeros_(self.paper_proj.bias) |
|
|
| def encode(self, data): |
| x_dict = { |
| 'author': self.author_emb.weight, |
| 'paper': self.paper_proj(data['paper'].x), |
| } |
| all_layers = [x_dict] |
| for layer in self.layers: |
| x_dict = layer(x_dict, data.edge_index_dict) |
| all_layers.append(x_dict) |
| weights = 1.0 / (self.num_layers + 1) |
| return { |
| nt: sum(weights * l[nt] for l in all_layers) |
| for nt in x_dict |
| } |
|
|
| def decode(self, z_dict, edge_index): |
| src, dst = edge_index |
| return (z_dict['author'][src] * z_dict['paper'][dst]).sum(dim=-1) |
|
|
|
|
| |
| |
| |
|
|
| class ResidualHeteroConv(nn.Module): |
| def __init__(self, hidden_dim, dropout=0.2): |
| super().__init__() |
| edge_types_used = [ |
| ('author', 'ref', 'paper'), ('paper', 'beref', 'author'), |
| ('paper', 'cite', 'paper'), ('author', 'coauthor', 'author'), |
| ] |
| conv_dict = {et: SAGEConv(hidden_dim, hidden_dim) for et in edge_types_used} |
| self.conv = HeteroConv(conv_dict, aggr='mean') |
| self.norms = nn.ModuleDict({ |
| 'author': nn.LayerNorm(hidden_dim), |
| 'paper': nn.LayerNorm(hidden_dim), |
| }) |
| self.dropout = nn.Dropout(dropout) |
|
|
| def forward(self, x_dict, edge_index_dict): |
| h = self.conv(x_dict, edge_index_dict) |
| return { |
| nt: self.dropout(F.relu(self.norms[nt](h[nt] + x_dict[nt]))) |
| for nt in h |
| } |
|
|
|
|
| class SAGERecommender(nn.Module): |
| def __init__(self, hidden_dim=128, num_layers=3): |
| super().__init__() |
| self.author_emb = nn.Embedding(num_authors, hidden_dim) |
| self.paper_proj = nn.Linear(paper_feat_aug.shape[1], hidden_dim) |
| self.convs = nn.ModuleList( |
| [ResidualHeteroConv(hidden_dim) for _ in range(num_layers)]) |
| self.reset_parameters() |
|
|
| def reset_parameters(self): |
| nn.init.xavier_uniform_(self.author_emb.weight) |
| nn.init.xavier_uniform_(self.paper_proj.weight) |
| nn.init.zeros_(self.paper_proj.bias) |
|
|
| def encode(self, data): |
| x_dict = { |
| 'author': self.author_emb.weight, |
| 'paper': self.paper_proj(data['paper'].x), |
| } |
| for conv in self.convs: |
| x_dict = conv(x_dict, data.edge_index_dict) |
| return x_dict |
|
|
| def decode(self, z_dict, edge_index): |
| src, dst = edge_index |
| return (z_dict['author'][src] * z_dict['paper'][dst]).sum(dim=-1) |
|
|
|
|
| |
| @torch.no_grad() |
| def predict_cos_batched(model, data, pairs, batch_size=65536): |
| model.eval() |
| z_dict = model.encode(data) |
| z_cpu = {k: v.cpu() for k, v in z_dict.items()} |
| all_scores = [] |
| for start in range(0, len(pairs), batch_size): |
| end = min(start + batch_size, len(pairs)) |
| batch = pairs[start:end] |
| scores = cos_sim( |
| z_cpu['author'][batch[:, 0]].numpy(), |
| z_cpu['paper'][batch[:, 1]].numpy(), |
| ) |
| all_scores.append(scores) |
| return np.concatenate(all_scores) |
|
|
|
|
| |
| def train_lightgcn(seed, embed_dim=256, num_layers=4, |
| lr=0.005, num_epochs=200): |
| set_seed(seed) |
| data_local = build_data(ref_edges) |
| model = LightGCNRecommender(embed_dim, num_layers).to(device) |
|
|
| optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5) |
| pos_edges = data_local['author', 'ref', 'paper'].edge_index |
| batch_size = min(32768, pos_edges.size(1)) |
|
|
| for epoch in range(num_epochs): |
| model.train() |
| perm = torch.randperm(pos_edges.size(1), device=device)[:batch_size] |
| pos_batch = pos_edges[:, perm] |
| neg_batch = sample_hard_negatives(pos_batch.size(1) * 2) |
|
|
| z_dict = model.encode(data_local) |
| pos_score = model.decode(z_dict, pos_batch).repeat_interleave(2) |
| neg_score = model.decode(z_dict, neg_batch) |
| loss = -F.logsigmoid(pos_score - neg_score).mean() |
|
|
| optimizer.zero_grad() |
| loss.backward() |
| torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) |
| optimizer.step() |
|
|
| if epoch % 50 == 0 or epoch == num_epochs - 1: |
| print(f' LGCN seed={seed} epoch {epoch:03d} loss={loss.item():.4f}') |
|
|
| return model.cpu(), data_local |
|
|
|
|
| def train_sage(seed, hidden_dim=128, num_layers=3, |
| lr=0.005, num_epochs=200): |
| set_seed(seed) |
| data_local = build_data(ref_edges) |
| model = SAGERecommender(hidden_dim, num_layers).to(device) |
|
|
| optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4) |
| pos_edges = data_local['author', 'ref', 'paper'].edge_index |
| batch_size = min(32768, pos_edges.size(1)) |
|
|
| for epoch in range(num_epochs): |
| model.train() |
| perm = torch.randperm(pos_edges.size(1), device=device)[:batch_size] |
| pos_batch = pos_edges[:, perm] |
| neg_batch = sample_hard_negatives(pos_batch.size(1) * 2) |
|
|
| z_dict = model.encode(data_local) |
| pos_score = model.decode(z_dict, pos_batch).repeat_interleave(2) |
| neg_score = model.decode(z_dict, neg_batch) |
| loss = -F.logsigmoid(pos_score - neg_score).mean() |
|
|
| optimizer.zero_grad() |
| loss.backward() |
| torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) |
| optimizer.step() |
|
|
| if epoch % 50 == 0 or epoch == num_epochs - 1: |
| print(f' SAGE seed={seed} epoch {epoch:03d} loss={loss.item():.4f}') |
|
|
| return model.cpu(), data_local |
|
|
|
|
| |
| print("\n" + "=" * 60) |
| print("Training LightGCN models (full data, 200 epochs)") |
| print("=" * 60) |
|
|
| lgcn_models = [] |
| for seed in [0, 42, 2024]: |
| print(f"\n[LightGCN seed={seed}]") |
| m, d = train_lightgcn(seed, embed_dim=256, num_layers=4, num_epochs=150) |
| lgcn_models.append(m) |
|
|
| print("\n" + "=" * 60) |
| print("Training SAGEConv models (full data, 150 epochs)") |
| print("=" * 60) |
|
|
| sage_models = [] |
| for seed in [0, 42]: |
| print(f"\n[SAGE seed={seed}]") |
| m, d = train_sage(seed, hidden_dim=128, num_layers=3, num_epochs=150) |
| sage_models.append(m) |
|
|
| |
| print("\n" + "=" * 60) |
| print("Generating predictions...") |
| print("=" * 60) |
|
|
| test_arr = np.array(refs_to_pred, dtype=np.int64) |
| data_full = build_data(ref_edges) |
| all_model_scores = [] |
|
|
| |
| for i, model in enumerate(lgcn_models): |
| model = model.to(device) |
| scores = predict_cos_batched(model, data_full, test_arr) |
| all_model_scores.append(scores) |
| model = model.cpu() |
| print(f" LGCN-{i}: mean={scores.mean():.4f}") |
|
|
| |
| for i, model in enumerate(sage_models): |
| model = model.to(device) |
| scores = predict_cos_batched(model, data_full, test_arr) |
| all_model_scores.append(scores) |
| model = model.cpu() |
| print(f" SAGE-{i}: mean={scores.mean():.4f}") |
|
|
| |
| ensemble_scores = np.mean(all_model_scores, axis=0) |
|
|
| |
| known_pos_mask = np.array([tuple(p) in overlap for p in refs_to_pred]) |
| ensemble_scores[known_pos_mask] = 1.0 |
|
|
| |
| thresholds_to_try = [0.30, 0.35, 0.40, 0.45, 0.50, 0.55] |
| for thresh in thresholds_to_try: |
| predictions = (ensemble_scores >= thresh).astype(int) |
| pos_ratio = predictions.mean() |
| extra_pos = predictions.sum() - known_pos_mask.sum() |
|
|
| output_path = f"/home/lzc/submission_t{thresh:.2f}.csv" |
| data_out = [[idx, str(int(p))] for idx, p in enumerate(predictions)] |
| pd.DataFrame(data_out, columns=['Index', 'Predicted'], dtype=object).to_csv( |
| output_path, index=False) |
| print(f" thresh={thresh:.2f}: pos_ratio={pos_ratio:.4f} " |
| f"({predictions.sum()}/{len(predictions)}), extra_pos={extra_pos}") |
|
|
| print("\nDone! Try different thresholds on the leaderboard.") |
|
|