| """Adapted from project-example-2026-pygver.ipynb for local execution.""" |
| import os |
| import pickle as pkl |
| import random |
|
|
| import numpy as np |
| import pandas as pd |
| import torch |
| import torch.nn as nn |
| import torch.nn.functional as F |
| import tqdm |
|
|
| from torch_geometric.data import HeteroData |
| from sklearn.metrics import f1_score, precision_recall_curve |
|
|
# Run on the first CUDA device when one is visible, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Report the runtime so logs show which backend produced the results.
print('torch:', torch.__version__)
print('device:', device)
|
|
|
|
def set_seed(seed=0):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Value handed to every generator (default 0).
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    # The CUDA generators are only seeded when a GPU is available.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
|
|
|
|
# Seed every RNG before any data handling or model construction happens.
set_seed(seed=0)
|
|
| |
# Directory that holds the dataset files. The default is the original local
# checkout; set the CS3319_BASE_PATH environment variable to run the script
# against a different location without editing the source.
base_path = os.environ.get("CS3319_BASE_PATH", "/home/lzc/cs3319-project")

# Input files, all resolved relative to base_path.
cite_file = os.path.join(base_path, "paper_file_ann.txt")
train_ref_file = os.path.join(base_path, "bipartite_train_ann.txt")
test_ref_file = os.path.join(base_path, "bipartite_test_ann.txt")
coauthor_file = os.path.join(base_path, "author_file_ann.txt")
feature_file = os.path.join(base_path, "feature.pkl")
|
|
|
|
def read_txt(file):
    """Parse a text file of whitespace-separated integers.

    Args:
        file: Path of the text file to read.

    Returns:
        A list with one entry per non-blank line; each entry is the list of
        integers found on that line, in order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a token on a non-blank line is not an integer.
    """
    res_list = []
    with open(file, "r", encoding="utf-8") as f:
        for line in f:
            tokens = line.split()
            # Blank or whitespace-only lines (typically a trailing newline)
            # would otherwise become empty rows and break the fixed-width
            # tables built from this output.
            if tokens:
                res_list.append([int(token) for token in tokens])
    return res_list
|
|
|
|
# Parse the four edge lists in the same order as the file constants:
# citations, known author->paper references, pairs to predict, co-authorships.
citation, existing_refs, refs_to_pred, coauthor = (
    read_txt(path)
    for path in (cite_file, train_ref_file, test_ref_file, coauthor_file)
)

# NOTE(review): unpickling executes code embedded in the file, so the feature
# file must come from a trusted source.
with open(feature_file, 'rb') as f:
    paper_feature = pkl.load(f)

# Quick sanity summary of everything that was loaded.
print(f"Number of citation edges: {len(citation)}")
print(f"Number of existing references: {len(existing_refs)}")
print(f"Number of author-paper pairs to predict: {len(refs_to_pred)}")
print(f"Number of coauthor edges: {len(coauthor)}")
print(f"Shape of paper features: {paper_feature.shape}")
|
|
| |
def _edge_frame(pairs, prefix):
    """Return a source/target frame whose row labels are prefix + row number."""
    frame = pd.DataFrame(pairs, columns=['source', 'target'])
    return frame.set_index(prefix + frame.index.astype(str))


# One frame per relation; the label prefix keeps rows of different relations
# distinguishable ("c-" citation, "r-" reference, "a-" co-authorship).
cite_edges = _edge_frame(citation, "c-")
ref_edges = _edge_frame(existing_refs, "r-")
coauthor_edges = _edge_frame(coauthor, "a-")

# Paper ids: both ends of every citation plus the target side of references.
node_tmp = pd.concat(
    [cite_edges['source'], cite_edges['target'], ref_edges['target']]
)
node_papers = pd.DataFrame(index=pd.unique(node_tmp))

# Author ids: the source side of references plus both ends of co-authorships.
node_tmp = pd.concat(
    [ref_edges['source'], coauthor_edges['source'], coauthor_edges['target']]
)
node_authors = pd.DataFrame(index=pd.unique(node_tmp))

print(f"Number of paper nodes: {len(node_papers)}, number of author nodes: {len(node_authors)}")
|
|
| |
# 90% of the known references feed the training graph; the rows that were not
# sampled become the positive validation pairs.
train_refs = ref_edges.sample(frac=0.9, random_state=0, axis=0)
test_true_refs = ref_edges.loc[~ref_edges.index.isin(train_refs.index)].copy()
test_true_refs['label'] = 1

# Every known (author, paper) reference, used to reject sampled negatives.
existing_ref_set = set(
    zip(ref_edges['source'].tolist(), ref_edges['target'].tolist())
)
num_test_pos = len(test_true_refs)
author_ids = node_authors.index.to_numpy(dtype=np.int64)
paper_ids = node_papers.index.to_numpy(dtype=np.int64)

# Draw as many negatives as there are positives: random author/paper pairs
# that do not occur among the known references.
neg_pairs = []
rng = np.random.default_rng(0)
while len(neg_pairs) < num_test_pos:
    src = int(rng.choice(author_ids))
    dst = int(rng.choice(paper_ids))
    if (src, dst) in existing_ref_set:
        continue
    neg_pairs.append((src, dst))

test_false_refs = pd.DataFrame(neg_pairs, columns=['source', 'target'])
test_false_refs['label'] = 0

# Merge positives and negatives, then shuffle with a fixed seed.
test_refs = pd.concat([test_true_refs, test_false_refs], ignore_index=True)
test_refs = test_refs.sample(frac=1, random_state=0, axis=0).reset_index(drop=True)
print(f"Validation set: {len(test_refs)} pairs "
      f"(pos={test_refs['label'].sum()}, neg={len(test_refs) - test_refs['label'].sum()})")
|
|
| |
# Edge lists as (num_edges, 2) long tensors.
train_ref_tensor = torch.as_tensor(train_refs[['source', 'target']].to_numpy(), dtype=torch.long)
cite_tensor = torch.as_tensor(cite_edges[['source', 'target']].to_numpy(), dtype=torch.long)
coauthor_tensor = torch.as_tensor(coauthor_edges[['source', 'target']].to_numpy(), dtype=torch.long)

# np.array([]) would be 1-D, so an empty prediction list gets an explicit
# (0, 2) shape to keep the column indexing below valid.
test_ref_arr = np.array(refs_to_pred, dtype=np.int64) if len(refs_to_pred) > 0 else np.zeros((0, 2), dtype=np.int64)

# Node counts are "largest id seen anywhere + 1", so every id that appears in
# the training edges or in the pairs to predict is a valid row index.
num_authors = int(max(
    ref_edges['source'].max(),
    coauthor_edges['source'].max(),
    coauthor_edges['target'].max(),
    test_ref_arr[:, 0].max() if len(test_ref_arr) else 0,
) + 1)
num_papers = int(max(
    cite_edges['source'].max(),
    cite_edges['target'].max(),
    ref_edges['target'].max(),
    test_ref_arr[:, 1].max() if len(test_ref_arr) else 0,
    paper_feature.shape[0] - 1,
) + 1)

paper_x = torch.as_tensor(paper_feature, dtype=torch.float)
# num_papers is never smaller than the number of feature rows (that count is
# one of the max() candidates above), so the features can only ever need
# zero-padding; a truncation branch would be unreachable.
missing_rows = num_papers - paper_x.size(0)
if missing_rows > 0:
    pad = torch.zeros(missing_rows, paper_x.size(1), dtype=paper_x.dtype)
    paper_x = torch.cat([paper_x, pad], dim=0)
|
|
def _with_reverse(pairs):
    """Stack an edge list with its reversed copy and return it as (2, 2E)."""
    return torch.cat([pairs, pairs[:, [1, 0]]], dim=0).t().contiguous()


# Assemble the heterogeneous graph. Edge types are registered in the order
# ref, beref, cite, coauthor.
data = HeteroData()
data['author'].num_nodes = num_authors
data['paper'].x = paper_x
data['paper'].num_nodes = num_papers

# Author->paper references and the same edges reversed as a separate relation.
data['author', 'ref', 'paper'].edge_index = train_ref_tensor.t().contiguous()
data['paper', 'beref', 'author'].edge_index = train_ref_tensor[:, [1, 0]].t().contiguous()

# Citation and co-authorship edges are stored in both directions.
data['paper', 'cite', 'paper'].edge_index = _with_reverse(cite_tensor)
data['author', 'coauthor', 'author'].edge_index = _with_reverse(coauthor_tensor)

data = data.to(device)
print(data)
print('metadata:', data.metadata())
|
|
|
|
| |
class HeteroMeanConv(nn.Module):
    """Mean-aggregation message-passing layer over typed nodes and edges.

    Every node type owns a biased linear self-transform. Every edge type owns
    a bias-free linear transform that is applied to the mean of the source
    features arriving at each destination node. The per-type result is the
    sum of all contributions divided by their number (self term included).

    Args:
        metadata: Pair ``(node_types, edge_types)``; each edge type is a
            ``(src_type, relation, dst_type)`` triple.
        in_dims: Mapping from node type to input feature width.
        out_dim: Output feature width shared by all node types.
    """

    def __init__(self, metadata, in_dims, out_dim):
        super().__init__()
        node_types, edge_types = metadata
        self.node_types = list(node_types)
        self.edge_types = list(edge_types)

        # Relation transforms are created before the self transforms, which
        # fixes the order in which parameters are initialised.
        relation_layers = {}
        for edge_type in self.edge_types:
            relation_layers[self._key(edge_type)] = nn.Linear(
                in_dims[edge_type[0]], out_dim, bias=False
            )
        self.rel_lins = nn.ModuleDict(relation_layers)

        self_layers = {}
        for node_type in self.node_types:
            self_layers[node_type] = nn.Linear(in_dims[node_type], out_dim)
        self.self_lins = nn.ModuleDict(self_layers)

    @staticmethod
    def _key(edge_type):
        """Flatten an edge-type tuple into a string usable as a module key."""
        return '__'.join(edge_type)

    def reset_parameters(self):
        """Re-initialise every linear layer, relation layers first."""
        for layer in [*self.rel_lins.values(), *self.self_lins.values()]:
            layer.reset_parameters()

    @staticmethod
    def _mean_of_sources(src_x, src, dst, num_dst):
        """Average the rows ``src_x[src]`` per destination index ``dst``."""
        total = src_x.new_zeros((num_dst, src_x.size(-1)))
        total.index_add_(0, dst, src_x[src])
        count = src_x.new_zeros((num_dst, 1))
        count.index_add_(
            0, dst,
            torch.ones((dst.numel(), 1), dtype=src_x.dtype, device=src_x.device),
        )
        # Destinations without incoming edges keep a zero vector instead of
        # dividing by zero.
        return total / count.clamp(min=1.0)

    def forward(self, x_dict, edge_index_dict, num_nodes_dict):
        """Run one round of message passing.

        Args:
            x_dict: Mapping from node type to its feature matrix.
            edge_index_dict: Mapping from edge-type triple to an index pair
                that unpacks into ``(src, dst)`` index tensors.
            num_nodes_dict: Mapping from node type to its number of nodes.

        Returns:
            Mapping from node type to the averaged output features.
        """
        out_dict = {}
        for node_type in self.node_types:
            out_dict[node_type] = self.self_lins[node_type](x_dict[node_type])
        # The self term counts as one contribution for every node type.
        rel_count = dict.fromkeys(self.node_types, 1)

        for edge_type, edge_index in edge_index_dict.items():
            src_type, _, dst_type = edge_type
            src, dst = edge_index
            neighbour_mean = self._mean_of_sources(
                x_dict[src_type], src, dst, num_nodes_dict[dst_type]
            )
            message = self.rel_lins[self._key(edge_type)](neighbour_mean)
            out_dict[dst_type] = out_dict[dst_type] + message
            rel_count[dst_type] += 1

        averaged = {}
        for node_type in self.node_types:
            averaged[node_type] = out_dict[node_type] / rel_count[node_type]
        return averaged
|
|
|
|
class HeteroRecommender(nn.Module):
    """Two-layer heterogeneous encoder with a dot-product link scorer.

    Authors get a learnable embedding table; paper features are projected to
    the same width by a linear layer. Two ``HeteroMeanConv`` layers with a
    ReLU in between produce the final node embeddings.

    Note:
        The number of authors and papers is read from the module-level
        ``num_authors`` and ``num_papers`` variables, not from arguments.

    Args:
        metadata: ``(node_types, edge_types)`` passed on to both conv layers.
        paper_in_dim: Width of the raw paper feature vectors.
        hidden_dim: Output width of the first conv layer.
        out_dim: Output width of the second conv layer.
        author_in_dim: Width of the author embeddings and of the projected
            paper features.
    """

    def __init__(self, metadata, paper_in_dim, hidden_dim=64, out_dim=10, author_in_dim=512):
        super().__init__()
        self.author_emb = nn.Embedding(num_authors, author_in_dim)
        self.paper_lin = nn.Linear(paper_in_dim, author_in_dim)
        self.num_nodes_dict = {'author': num_authors, 'paper': num_papers}

        # Both node types enter the first layer at author_in_dim and the
        # second layer at hidden_dim.
        first_in = {'author': author_in_dim, 'paper': author_in_dim}
        second_in = {'author': hidden_dim, 'paper': hidden_dim}
        self.conv1 = HeteroMeanConv(metadata, in_dims=first_in, out_dim=hidden_dim)
        self.conv2 = HeteroMeanConv(metadata, in_dims=second_in, out_dim=out_dim)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialise the embedding table, the projection and both convs."""
        nn.init.xavier_uniform_(self.author_emb.weight)
        for module in (self.paper_lin, self.conv1, self.conv2):
            module.reset_parameters()

    def encode(self, data):
        """Compute embeddings for all nodes of ``data``.

        Args:
            data: Graph object exposing ``data['paper'].x`` and
                ``data.edge_index_dict``.

        Returns:
            Mapping from node type to its embedding matrix.
        """
        features = {
            'author': self.author_emb.weight,
            'paper': self.paper_lin(data['paper'].x),
        }
        hidden = self.conv1(features, data.edge_index_dict, self.num_nodes_dict)
        hidden = {node_type: F.relu(feat) for node_type, feat in hidden.items()}
        return self.conv2(hidden, data.edge_index_dict, self.num_nodes_dict)

    def decode(self, z_dict, edge_label_index):
        """Score author/paper pairs by the dot product of their embeddings.

        Args:
            z_dict: Mapping with ``'author'`` and ``'paper'`` embeddings.
            edge_label_index: Index pair that unpacks into author indices and
                paper indices.

        Returns:
            One score per pair.
        """
        author_idx, paper_idx = edge_label_index
        author_z = z_dict['author'][author_idx]
        paper_z = z_dict['paper'][paper_idx]
        return (author_z * paper_z).sum(dim=-1)
|
|
|
|
def sample_negative_edges(num_samples, num_authors, num_papers, existing_edges, device):
    """Sample random (author, paper) pairs that are not known edges.

    Candidates are drawn uniformly on the CPU and rejected when they occur in
    ``existing_edges``. The same pair may be returned more than once.

    Args:
        num_samples: Number of negative pairs to return.
        num_authors: Author indices are drawn from ``[0, num_authors)``.
        num_papers: Paper indices are drawn from ``[0, num_papers)``.
        existing_edges: Sized container of ``(author, paper)`` tuples to
            avoid (a set gives constant-time rejection tests).
        device: Device on which the returned tensor is placed.

    Returns:
        A ``(2, num_samples)`` long tensor; row 0 holds author indices and
        row 1 holds paper indices.

    Raises:
        ValueError: If every possible pair is already in ``existing_edges``,
            so no negative can ever be drawn.
    """
    # An empty Python list would otherwise become a 1-D tensor of shape (0,),
    # which cannot be unpacked into source/destination rows.
    if num_samples <= 0:
        return torch.empty((2, 0), dtype=torch.long, device=device)

    # Rejection sampling never terminates when nothing can be accepted. The
    # exact count is only computed when the cheap size test makes it possible.
    total_pairs = num_authors * num_papers
    if total_pairs > 0 and len(existing_edges) >= total_pairs:
        blocked = sum(
            1 for s, d in existing_edges
            if 0 <= s < num_authors and 0 <= d < num_papers
        )
        if blocked >= total_pairs:
            raise ValueError(
                "cannot sample negative edges: every author/paper pair "
                "is already an existing edge"
            )

    neg_edges = []
    while len(neg_edges) < num_samples:
        need = num_samples - len(neg_edges)
        # Draw twice as many candidates as needed to absorb rejections.
        src = torch.randint(0, num_authors, (need * 2,), device='cpu')
        dst = torch.randint(0, num_papers, (need * 2,), device='cpu')
        for pair in zip(src.tolist(), dst.tolist()):
            if pair in existing_edges:
                continue
            neg_edges.append(pair)
            if len(neg_edges) == num_samples:
                break
    return torch.tensor(neg_edges, dtype=torch.long, device=device).t().contiguous()
|
|
|
|
| |
# Model sized from the graph's paper features; all other widths are fixed.
model = HeteroRecommender(
    data.metadata(),
    paper_in_dim=data['paper'].x.size(-1),
    hidden_dim=64,
    out_dim=10,
    author_in_dim=512,
)
model = model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-5)

# Positive supervision edges and the lookup used to reject sampled negatives.
pos_edge_index = data['author', 'ref', 'paper'].edge_index
existing_train_set = set(
    zip(train_refs['source'].tolist(), train_refs['target'].tolist())
)

train_edge_batch_size = min(32768, pos_edge_index.size(1))
num_epochs = 60

print(f"\nTraining {num_epochs} epochs (batch_size={train_edge_batch_size})...")
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()

    # Each epoch performs one optimisation step on a fresh random subset of
    # the positive edges and an equally sized set of sampled negatives.
    batch_perm = torch.randperm(pos_edge_index.size(1), device=device)[:train_edge_batch_size]
    pos_batch = pos_edge_index[:, batch_perm]
    neg_batch = sample_negative_edges(
        pos_batch.size(1), num_authors, num_papers, existing_train_set, device
    )

    z_dict = model.encode(data)
    pos_score = model.decode(z_dict, pos_batch)
    neg_score = model.decode(z_dict, neg_batch)

    # Hinge loss with margin 1: the i-th positive should outscore the i-th
    # negative by at least one.
    margin_violation = 1.0 - pos_score + neg_score
    loss = margin_violation.clamp(min=0).mean()
    loss.backward()
    optimizer.step()

    # Log every tenth epoch and always the final one.
    is_last_epoch = epoch == num_epochs - 1
    if epoch % 10 == 0 or is_last_epoch:
        print(f'Epoch {epoch:03d}, loss={loss.item():.4f}')
|
|
| |
# Final embeddings for every node, computed in eval mode without gradient
# tracking and moved to the CPU for the NumPy-based scoring that follows.
model.eval()
with torch.no_grad():
    node_embeddings = {
        node_type: emb.detach().cpu()
        for node_type, emb in model.encode(data).items()
    }
|
|
| from numpy.linalg import norm |
|
|
|
|
def cos_sim(a, b, eps=1e-12):
    """Return the cosine similarity of ``a`` and ``b`` reduced along axis 1.

    Args:
        a: NumPy array whose rows are the first vectors.
        b: NumPy array whose rows are the second vectors.
        eps: Small constant added to the denominator so that zero-length
            vectors give 0 instead of a division by zero.

    Returns:
        Array with one similarity value per row pair.
    """
    dot = (a * b).sum(axis=1)
    scale = np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1) + eps
    return dot / scale
|
|
|
|
# Score every validation pair with the cosine similarity of its embeddings.
test_arr = test_refs[['source', 'target']].to_numpy(dtype=np.int64)
val_author_emb = node_embeddings['author'][test_arr[:, 0]].numpy()
val_paper_emb = node_embeddings['paper'][test_arr[:, 1]].numpy()
res = cos_sim(val_author_emb, val_paper_emb)

lbl_true = test_refs['label'].to_numpy().flatten()

precision, recall, thresholds = precision_recall_curve(lbl_true, np.asarray(res))

# F1 at every point of the curve; the tiny constant avoids 0/0 where both
# precision and recall are zero.
f1_scores = 2 * precision * recall / (precision + recall + 1e-12)
best_idx = np.argmax(f1_scores)
# The curve has one more point than there are thresholds, so fall back to
# 0.5 if the best point is that extra one.
if best_idx < len(thresholds):
    best_threshold = thresholds[best_idx]
else:
    best_threshold = 0.5
best_f1 = f1_scores[best_idx]

print(f"\nBest threshold (val): {best_threshold:.4f}, Best F1 (val): {best_f1:.4f}")

# Reference point: F1 at the fixed threshold 0.5.
lbl_pred_05 = (np.asarray(res) >= 0.5).astype(int)
print(f"F1 @ threshold=0.5: {f1_score(lbl_true, lbl_pred_05):.4f}")
|
|
| |
# Score the pairs to predict and write the submission file.
output_path = os.path.join("/home/lzc", "submission.csv")

# np.array([]) is 1-D, so an empty list of pairs needs an explicit (0, 2)
# shape before its columns can be indexed.
if len(refs_to_pred) > 0:
    test_arr_final = np.array(refs_to_pred, dtype=np.int64)
else:
    test_arr_final = np.zeros((0, 2), dtype=np.int64)

res_final = cos_sim(
    node_embeddings['author'][test_arr_final[:, 0]].numpy(),
    node_embeddings['paper'][test_arr_final[:, 1]].numpy(),
)

# A pair is predicted positive when its similarity reaches the threshold
# selected on the validation split.
res_pred = (res_final >= best_threshold).astype(int)

# One row per pair, in input order: running index and the 0/1 prediction.
data_out = [[idx, str(int(p))] for idx, p in enumerate(res_pred)]
df = pd.DataFrame(data_out, columns=['Index', 'Predicted'], dtype=object)
df.to_csv(output_path, index=False)
print(f"\nSubmission saved to: {output_path}")

# The mean of an empty array is undefined; report 0 when nothing was scored.
positive_ratio = float(res_pred.mean()) if res_pred.size else 0.0
print(f"Predicted positive ratio: {positive_ratio:.4f}")
|
|