"""Adapted from project-example-2026-pygver.ipynb for local execution.""" import os import pickle as pkl import random import numpy as np import pandas as pd import torch import torch.nn as nn import torch.nn.functional as F import tqdm from torch_geometric.data import HeteroData from sklearn.metrics import f1_score, precision_recall_curve device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') print('torch:', torch.__version__) print('device:', device) def set_seed(seed=0): random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed) set_seed(0) # ── Paths ────────────────────────────────────────────────────────── base_path = "/home/lzc/cs3319-project" cite_file = os.path.join(base_path, "paper_file_ann.txt") train_ref_file = os.path.join(base_path, "bipartite_train_ann.txt") test_ref_file = os.path.join(base_path, "bipartite_test_ann.txt") coauthor_file = os.path.join(base_path, "author_file_ann.txt") feature_file = os.path.join(base_path, "feature.pkl") def read_txt(file): res_list = [] with open(file, "r") as f: for line in f: res_list.append(list(map(int, line.strip().split()))) return res_list citation = read_txt(cite_file) existing_refs = read_txt(train_ref_file) refs_to_pred = read_txt(test_ref_file) coauthor = read_txt(coauthor_file) with open(feature_file, 'rb') as f: paper_feature = pkl.load(f) print(f"Number of citation edges: {len(citation)}") print(f"Number of existing references: {len(existing_refs)}") print(f"Number of author-paper pairs to predict: {len(refs_to_pred)}") print(f"Number of coauthor edges: {len(coauthor)}") print(f"Shape of paper features: {paper_feature.shape}") # ── Build edge dataframes ───────────────────────────────────────── cite_edges = pd.DataFrame(citation, columns=['source', 'target']) cite_edges = cite_edges.set_index("c-" + cite_edges.index.astype(str)) ref_edges = pd.DataFrame(existing_refs, columns=['source', 'target']) ref_edges = ref_edges.set_index("r-" + ref_edges.index.astype(str)) coauthor_edges = pd.DataFrame(coauthor, columns=['source', 'target']) coauthor_edges = coauthor_edges.set_index("a-" + coauthor_edges.index.astype(str)) # ── Build node DataFrames ───────────────────────────────────────── node_tmp = pd.concat([cite_edges['source'], cite_edges['target'], ref_edges['target']]) node_papers = pd.DataFrame(index=pd.unique(node_tmp)) node_tmp = pd.concat([ref_edges['source'], coauthor_edges['source'], coauthor_edges['target']]) node_authors = pd.DataFrame(index=pd.unique(node_tmp)) print(f"Number of paper nodes: {len(node_papers)}, number of author nodes: {len(node_authors)}") # ── Train / validation split ────────────────────────────────────── train_refs = ref_edges.sample(frac=0.9, random_state=0, axis=0) test_true_refs = ref_edges[~ref_edges.index.isin(train_refs.index)].copy() test_true_refs['label'] = 1 existing_ref_set = set(map(tuple, ref_edges[['source', 'target']].to_numpy().tolist())) num_test_pos = len(test_true_refs) author_ids = node_authors.index.to_numpy(dtype=np.int64) paper_ids = node_papers.index.to_numpy(dtype=np.int64) neg_pairs = [] rng = np.random.default_rng(0) while len(neg_pairs) < num_test_pos: src = int(rng.choice(author_ids)) dst = int(rng.choice(paper_ids)) if (src, dst) not in existing_ref_set: neg_pairs.append((src, dst)) test_false_refs = pd.DataFrame(neg_pairs, columns=['source', 'target']) test_false_refs['label'] = 0 test_refs = pd.concat([test_true_refs, test_false_refs], ignore_index=True) test_refs = test_refs.sample(frac=1, random_state=0, axis=0).reset_index(drop=True) print(f"Validation set: {len(test_refs)} pairs " f"(pos={test_refs['label'].sum()}, neg={len(test_refs) - test_refs['label'].sum()})") # ── Build HeteroData ────────────────────────────────────────────── train_ref_tensor = torch.as_tensor(train_refs[['source', 'target']].to_numpy(), dtype=torch.long) cite_tensor = torch.as_tensor(cite_edges[['source', 'target']].to_numpy(), dtype=torch.long) coauthor_tensor = torch.as_tensor(coauthor_edges[['source', 'target']].to_numpy(), dtype=torch.long) test_ref_arr = np.array(refs_to_pred, dtype=np.int64) if len(refs_to_pred) > 0 else np.zeros((0, 2), dtype=np.int64) num_authors = int(max( ref_edges['source'].max(), coauthor_edges['source'].max(), coauthor_edges['target'].max(), test_ref_arr[:, 0].max() if len(test_ref_arr) else 0, ) + 1) num_papers = int(max( cite_edges['source'].max(), cite_edges['target'].max(), ref_edges['target'].max(), test_ref_arr[:, 1].max() if len(test_ref_arr) else 0, paper_feature.shape[0] - 1, ) + 1) paper_x = torch.as_tensor(paper_feature, dtype=torch.float) if paper_x.size(0) < num_papers: pad = torch.zeros(num_papers - paper_x.size(0), paper_x.size(1), dtype=paper_x.dtype) paper_x = torch.cat([paper_x, pad], dim=0) elif paper_x.size(0) > num_papers: paper_x = paper_x[:num_papers] data = HeteroData() data['author'].num_nodes = num_authors data['paper'].x = paper_x data['paper'].num_nodes = num_papers data['author', 'ref', 'paper'].edge_index = train_ref_tensor.t().contiguous() data['paper', 'beref', 'author'].edge_index = train_ref_tensor[:, [1, 0]].t().contiguous() data['paper', 'cite', 'paper'].edge_index = torch.cat([ cite_tensor, cite_tensor[:, [1, 0]], ], dim=0).t().contiguous() data['author', 'coauthor', 'author'].edge_index = torch.cat([ coauthor_tensor, coauthor_tensor[:, [1, 0]], ], dim=0).t().contiguous() data = data.to(device) print(data) print('metadata:', data.metadata()) # ── Model ───────────────────────────────────────────────────────── class HeteroMeanConv(nn.Module): def __init__(self, metadata, in_dims, out_dim): super().__init__() node_types, edge_types = metadata self.node_types = list(node_types) self.edge_types = list(edge_types) self.rel_lins = nn.ModuleDict({ self._key(edge_type): nn.Linear(in_dims[edge_type[0]], out_dim, bias=False) for edge_type in self.edge_types }) self.self_lins = nn.ModuleDict({ node_type: nn.Linear(in_dims[node_type], out_dim) for node_type in self.node_types }) @staticmethod def _key(edge_type): return '__'.join(edge_type) def reset_parameters(self): for layer in self.rel_lins.values(): layer.reset_parameters() for layer in self.self_lins.values(): layer.reset_parameters() def forward(self, x_dict, edge_index_dict, num_nodes_dict): out_dict = { node_type: self.self_lins[node_type](x_dict[node_type]) for node_type in self.node_types } rel_count = {node_type: 1 for node_type in self.node_types} for edge_type, edge_index in edge_index_dict.items(): src_type, _, dst_type = edge_type src, dst = edge_index src_x = x_dict[src_type] agg = src_x.new_zeros((num_nodes_dict[dst_type], src_x.size(-1))) deg = src_x.new_zeros((num_nodes_dict[dst_type], 1)) agg.index_add_(0, dst, src_x[src]) deg.index_add_( 0, dst, torch.ones((dst.numel(), 1), dtype=src_x.dtype, device=src_x.device), ) agg = agg / deg.clamp(min=1.0) out_dict[dst_type] = out_dict[dst_type] + self.rel_lins[self._key(edge_type)](agg) rel_count[dst_type] += 1 return { node_type: out_dict[node_type] / rel_count[node_type] for node_type in self.node_types } class HeteroRecommender(nn.Module): def __init__(self, metadata, paper_in_dim, hidden_dim=64, out_dim=10, author_in_dim=512): super().__init__() self.author_emb = nn.Embedding(num_authors, author_in_dim) self.paper_lin = nn.Linear(paper_in_dim, author_in_dim) self.num_nodes_dict = {'author': num_authors, 'paper': num_papers} self.conv1 = HeteroMeanConv( metadata, in_dims={'author': author_in_dim, 'paper': author_in_dim}, out_dim=hidden_dim, ) self.conv2 = HeteroMeanConv( metadata, in_dims={'author': hidden_dim, 'paper': hidden_dim}, out_dim=out_dim, ) self.reset_parameters() def reset_parameters(self): nn.init.xavier_uniform_(self.author_emb.weight) self.paper_lin.reset_parameters() self.conv1.reset_parameters() self.conv2.reset_parameters() def encode(self, data): x_dict = { 'author': self.author_emb.weight, 'paper': self.paper_lin(data['paper'].x), } x_dict = self.conv1(x_dict, data.edge_index_dict, self.num_nodes_dict) x_dict = {k: F.relu(v) for k, v in x_dict.items()} x_dict = self.conv2(x_dict, data.edge_index_dict, self.num_nodes_dict) return x_dict def decode(self, z_dict, edge_label_index): src, dst = edge_label_index return (z_dict['author'][src] * z_dict['paper'][dst]).sum(dim=-1) def sample_negative_edges(num_samples, num_authors, num_papers, existing_edges, device): neg_edges = [] while len(neg_edges) < num_samples: need = num_samples - len(neg_edges) src = torch.randint(0, num_authors, (need * 2,), device='cpu') dst = torch.randint(0, num_papers, (need * 2,), device='cpu') for s, d in zip(src.tolist(), dst.tolist()): if (s, d) not in existing_edges: neg_edges.append((s, d)) if len(neg_edges) == num_samples: break return torch.tensor(neg_edges, dtype=torch.long, device=device).t().contiguous() # ── Training ────────────────────────────────────────────────────── model = HeteroRecommender( data.metadata(), paper_in_dim=data['paper'].x.size(-1), hidden_dim=64, out_dim=10, author_in_dim=512, ).to(device) optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-5) pos_edge_index = data['author', 'ref', 'paper'].edge_index existing_train_set = set(map(tuple, train_refs[['source', 'target']].to_numpy().tolist())) train_edge_batch_size = min(32768, pos_edge_index.size(1)) num_epochs = 60 print(f"\nTraining {num_epochs} epochs (batch_size={train_edge_batch_size})...") for epoch in range(num_epochs): model.train() optimizer.zero_grad() batch_perm = torch.randperm(pos_edge_index.size(1), device=device)[:train_edge_batch_size] pos_batch = pos_edge_index[:, batch_perm] neg_batch = sample_negative_edges( pos_batch.size(1), num_authors, num_papers, existing_train_set, device ) z_dict = model.encode(data) pos_score = model.decode(z_dict, pos_batch) neg_score = model.decode(z_dict, neg_batch) loss = (1.0 - pos_score + neg_score).clamp(min=0).mean() loss.backward() optimizer.step() if epoch % 10 == 0 or epoch == num_epochs - 1: print(f'Epoch {epoch:03d}, loss={loss.item():.4f}') # ── Evaluation on validation set ────────────────────────────────── with torch.no_grad(): model.eval() node_embeddings = model.encode(data) node_embeddings = {k: v.detach().cpu() for k, v in node_embeddings.items()} from numpy.linalg import norm def cos_sim(a, b, eps=1e-12): return np.sum(a * b, axis=1) / (norm(a, axis=1) * norm(b, axis=1) + eps) test_arr = test_refs[['source', 'target']].to_numpy(dtype=np.int64) res = cos_sim( node_embeddings['author'][test_arr[:, 0]].numpy(), node_embeddings['paper'][test_arr[:, 1]].numpy(), ) lbl_true = test_refs['label'].to_numpy().flatten() # Threshold search for best F1 precision, recall, thresholds = precision_recall_curve(lbl_true, np.array(res)) # Find best F1 threshold f1_scores = 2 * precision * recall / (precision + recall + 1e-12) best_idx = np.argmax(f1_scores) best_threshold = thresholds[best_idx] if best_idx < len(thresholds) else 0.5 best_f1 = f1_scores[best_idx] print(f"\nBest threshold (val): {best_threshold:.4f}, Best F1 (val): {best_f1:.4f}") # F1 at threshold=0.5 lbl_pred_05 = (np.array(res) >= 0.5).astype(int) print(f"F1 @ threshold=0.5: {f1_score(lbl_true, lbl_pred_05):.4f}") # ── Generate submission ─────────────────────────────────────────── output_path = os.path.join("/home/lzc", "submission.csv") test_arr_final = np.array(refs_to_pred, dtype=np.int64) res_final = cos_sim( node_embeddings['author'][test_arr_final[:, 0]].numpy(), node_embeddings['paper'][test_arr_final[:, 1]].numpy(), ) res_pred = (res_final >= best_threshold).astype(int) data_out = [[idx, str(int(p))] for idx, p in enumerate(res_pred)] df = pd.DataFrame(data_out, columns=['Index', 'Predicted'], dtype=object) df.to_csv(output_path, index=False) print(f"\nSubmission saved to: {output_path}") print(f"Predicted positive ratio: {res_pred.mean():.4f}")