CS3319 Project 2 final deliverable (public F1 = 0.96626)

f28d994 12 days ago

13.9 kB

	"""Adapted from project-example-2026-pygver.ipynb for local execution."""
	import os
	import pickle as pkl
	import random

	import numpy as np
	import pandas as pd
	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	import tqdm

	from torch_geometric.data import HeteroData
	from sklearn.metrics import f1_score, precision_recall_curve

	# Prefer the first CUDA device when one is visible; otherwise stay on the CPU.
	if torch.cuda.is_available():
	    device = torch.device('cuda:0')
	else:
	    device = torch.device('cpu')

	# Log the runtime environment so the output of a run is self-describing.
	print('torch:', torch.__version__)
	print('device:', device)


	def set_seed(seed=0):
	    """Seed the Python, NumPy and PyTorch random generators.

	    Args:
	        seed: Integer seed applied to every generator.
	    """
	    random.seed(seed)
	    np.random.seed(seed)
	    torch.manual_seed(seed)
	    # Seed every visible GPU as well; skipped on CPU-only machines.
	    if torch.cuda.is_available():
	        torch.cuda.manual_seed_all(seed)


	# Seed all RNGs once, before any sampling or parameter initialisation below.
	set_seed(0)

	# ── Paths ──────────────────────────────────────────────────────────
	# Dataset directory. Set CS3319_DATA_DIR to run on another machine without
	# editing the script; when unset, the original location is used.
	base_path = os.environ.get("CS3319_DATA_DIR", "/home/lzc/cs3319-project")

	# Paper-paper citation pairs.
	cite_file = os.path.join(base_path, "paper_file_ann.txt")
	# Known author-paper reference pairs (training data).
	train_ref_file = os.path.join(base_path, "bipartite_train_ann.txt")
	# Author-paper pairs to predict for the submission.
	test_ref_file = os.path.join(base_path, "bipartite_test_ann.txt")
	# Author-author coauthorship pairs.
	coauthor_file = os.path.join(base_path, "author_file_ann.txt")
	# Pickled paper feature matrix.
	feature_file = os.path.join(base_path, "feature.pkl")


	def read_txt(file):
	    """Parse a whitespace-separated integer file into a list of rows.

	    Args:
	        file: Path of the text file to read.

	    Returns:
	        A list holding one list of ints per non-blank line, in file order.

	    Raises:
	        ValueError: If a token cannot be parsed as an integer.
	    """
	    res_list = []
	    with open(file, "r") as f:
	        for line in f:
	            # split() without arguments already ignores surrounding whitespace.
	            tokens = line.split()
	            # Skip blank lines: an empty row would break the fixed
	            # two-column edge tables built from this output.
	            if tokens:
	                res_list.append(list(map(int, tokens)))
	    return res_list


	# Load the four edge lists; each entry is one row of integers from the file.
	citation = read_txt(cite_file)
	existing_refs = read_txt(train_ref_file)
	refs_to_pred = read_txt(test_ref_file)
	coauthor = read_txt(coauthor_file)

	# NOTE(security): unpickling executes arbitrary code, so only load feature
	# files that come from a trusted source.
	with open(feature_file, 'rb') as f:
	    paper_feature = pkl.load(f)

	print(f"Number of citation edges: {len(citation)}")
	print(f"Number of existing references: {len(existing_refs)}")
	print(f"Number of author-paper pairs to predict: {len(refs_to_pred)}")
	print(f"Number of coauthor edges: {len(coauthor)}")
	print(f"Shape of paper features: {paper_feature.shape}")

	# ── Build edge dataframes ─────────────────────────────────────────
	def _indexed_edge_frame(pairs, prefix):
	    """Return a source/target DataFrame whose index is ``prefix`` + row number."""
	    frame = pd.DataFrame(pairs, columns=['source', 'target'])
	    return frame.set_index(prefix + frame.index.astype(str))


	cite_edges = _indexed_edge_frame(citation, "c-")
	ref_edges = _indexed_edge_frame(existing_refs, "r-")
	coauthor_edges = _indexed_edge_frame(coauthor, "a-")

	# ── Build node DataFrames ─────────────────────────────────────────
	# Papers: every id seen as a citation endpoint or as a reference target.
	node_tmp = pd.concat([
	    cite_edges['source'],
	    cite_edges['target'],
	    ref_edges['target'],
	])
	node_papers = pd.DataFrame(index=pd.unique(node_tmp))

	# Authors: every id seen as a reference source or as a coauthor endpoint.
	node_tmp = pd.concat([
	    ref_edges['source'],
	    coauthor_edges['source'],
	    coauthor_edges['target'],
	])
	node_authors = pd.DataFrame(index=pd.unique(node_tmp))

	print(f"Number of paper nodes: {len(node_papers)}, number of author nodes: {len(node_authors)}")

	# ── Train / validation split ──────────────────────────────────────
	# 90% of the known reference edges are kept for training; the remaining 10%
	# become the positive half of the validation set.
	train_refs = ref_edges.sample(frac=0.9, random_state=0, axis=0)
	test_true_refs = ref_edges[~ref_edges.index.isin(train_refs.index)].copy()
	test_true_refs['label'] = 1

	# Negatives must avoid every known reference edge, not only the training ones.
	existing_ref_set = set(map(tuple, ref_edges[['source', 'target']].to_numpy().tolist()))
	num_test_pos = len(test_true_refs)
	author_ids = node_authors.index.to_numpy(dtype=np.int64)
	paper_ids = node_papers.index.to_numpy(dtype=np.int64)

	# Rejection-sample one negative (author, paper) pair per validation positive.
	neg_pairs = []
	rng = np.random.default_rng(0)
	while len(neg_pairs) < num_test_pos:
	    src = int(rng.choice(author_ids))
	    dst = int(rng.choice(paper_ids))
	    if (src, dst) not in existing_ref_set:
	        neg_pairs.append((src, dst))

	test_false_refs = pd.DataFrame(neg_pairs, columns=['source', 'target'])
	test_false_refs['label'] = 0

	# Merge positives and negatives, then shuffle with a fixed seed.
	test_refs = pd.concat([test_true_refs, test_false_refs], ignore_index=True)
	test_refs = test_refs.sample(frac=1, random_state=0, axis=0).reset_index(drop=True)
	print(f"Validation set: {len(test_refs)} pairs "
	      f"(pos={test_refs['label'].sum()}, neg={len(test_refs) - test_refs['label'].sum()})")

	# ── Build HeteroData ──────────────────────────────────────────────
	train_ref_tensor = torch.as_tensor(train_refs[['source', 'target']].to_numpy(), dtype=torch.long)
	cite_tensor = torch.as_tensor(cite_edges[['source', 'target']].to_numpy(), dtype=torch.long)
	coauthor_tensor = torch.as_tensor(coauthor_edges[['source', 'target']].to_numpy(), dtype=torch.long)

	# Node counts must also cover ids that appear only in the pairs to predict,
	# so that those ids can be looked up in the embedding tables later.
	test_ref_arr = np.array(refs_to_pred, dtype=np.int64) if len(refs_to_pred) > 0 else np.zeros((0, 2), dtype=np.int64)
	num_authors = int(max(
	    ref_edges['source'].max(),
	    coauthor_edges['source'].max(),
	    coauthor_edges['target'].max(),
	    test_ref_arr[:, 0].max() if len(test_ref_arr) else 0,
	) + 1)
	num_papers = int(max(
	    cite_edges['source'].max(),
	    cite_edges['target'].max(),
	    ref_edges['target'].max(),
	    test_ref_arr[:, 1].max() if len(test_ref_arr) else 0,
	    paper_feature.shape[0] - 1,
	) + 1)

	paper_x = torch.as_tensor(paper_feature, dtype=torch.float)
	# num_papers is at least the number of feature rows (see the max above), so
	# the feature matrix can only need zero-padding, never truncation.
	if paper_x.size(0) < num_papers:
	    pad = torch.zeros(num_papers - paper_x.size(0), paper_x.size(1), dtype=paper_x.dtype)
	    paper_x = torch.cat([paper_x, pad], dim=0)

	data = HeteroData()
	data['author'].num_nodes = num_authors
	data['paper'].x = paper_x
	data['paper'].num_nodes = num_papers
	# Reference edges are stored in both directions under separate relation names.
	data['author', 'ref', 'paper'].edge_index = train_ref_tensor.t().contiguous()
	data['paper', 'beref', 'author'].edge_index = train_ref_tensor[:, [1, 0]].t().contiguous()
	# Citation and coauthor edges are symmetrised by appending the reversed pairs.
	data['paper', 'cite', 'paper'].edge_index = torch.cat([
	    cite_tensor,
	    cite_tensor[:, [1, 0]],
	], dim=0).t().contiguous()
	data['author', 'coauthor', 'author'].edge_index = torch.cat([
	    coauthor_tensor,
	    coauthor_tensor[:, [1, 0]],
	], dim=0).t().contiguous()

	data = data.to(device)
	print(data)
	print('metadata:', data.metadata())


	# ── Model ─────────────────────────────────────────────────────────
	class HeteroMeanConv(nn.Module):
	    """Mean-aggregation message-passing layer for a heterogeneous graph.

	    Every node type has its own linear self-transform (with bias) and every
	    edge type has its own bias-free linear transform, applied to the mean of
	    the source features arriving at each destination node. The output for a
	    node type is the average of its self term and one term per edge type
	    that points at it.

	    Args:
	        metadata: ``(node_types, edge_types)`` pair, where each edge type is a
	            ``(src_type, relation, dst_type)`` tuple of strings.
	        in_dims: Mapping from node type to its input feature size.
	        out_dim: Output feature size shared by all node types.
	    """

	    def __init__(self, metadata, in_dims, out_dim):
	        super().__init__()
	        node_types, edge_types = metadata
	        self.node_types = list(node_types)
	        self.edge_types = list(edge_types)
	        # Relation transforms consume source-node features, so their input
	        # size is the source type's size.
	        self.rel_lins = nn.ModuleDict({
	            self._key(edge_type): nn.Linear(in_dims[edge_type[0]], out_dim, bias=False)
	            for edge_type in self.edge_types
	        })
	        self.self_lins = nn.ModuleDict({
	            node_type: nn.Linear(in_dims[node_type], out_dim)
	            for node_type in self.node_types
	        })

	    @staticmethod
	    def _key(edge_type):
	        """Join an edge-type tuple into a single string key for ``nn.ModuleDict``."""
	        return '__'.join(edge_type)

	    def reset_parameters(self):
	        """Re-initialise every relation and self linear layer."""
	        for layer in self.rel_lins.values():
	            layer.reset_parameters()
	        for layer in self.self_lins.values():
	            layer.reset_parameters()

	    def forward(self, x_dict, edge_index_dict, num_nodes_dict):
	        """Run one round of mean aggregation.

	        Args:
	            x_dict: Mapping from node type to its feature matrix.
	            edge_index_dict: Mapping from edge-type tuple to an index tensor
	                that unpacks into ``(src, dst)`` index vectors.
	            num_nodes_dict: Mapping from node type to its number of nodes.

	        Returns:
	            Mapping from node type to its updated feature matrix.
	        """
	        out_dict = {
	            node_type: self.self_lins[node_type](x_dict[node_type])
	            for node_type in self.node_types
	        }
	        # Start at 1 to count the self term in the final average.
	        rel_count = {node_type: 1 for node_type in self.node_types}

	        for edge_type, edge_index in edge_index_dict.items():
	            src_type, _, dst_type = edge_type
	            src, dst = edge_index
	            src_x = x_dict[src_type]
	            # Sum incoming source features and count them per destination node.
	            agg = src_x.new_zeros((num_nodes_dict[dst_type], src_x.size(-1)))
	            deg = src_x.new_zeros((num_nodes_dict[dst_type], 1))
	            agg.index_add_(0, dst, src_x[src])
	            deg.index_add_(
	                0, dst,
	                torch.ones((dst.numel(), 1), dtype=src_x.dtype, device=src_x.device),
	            )
	            # Clamp so nodes without incoming edges keep a zero message
	            # instead of dividing by zero.
	            agg = agg / deg.clamp(min=1.0)
	            out_dict[dst_type] = out_dict[dst_type] + self.rel_lins[self._key(edge_type)](agg)
	            rel_count[dst_type] += 1

	        return {
	            node_type: out_dict[node_type] / rel_count[node_type]
	            for node_type in self.node_types
	        }


	class HeteroRecommender(nn.Module):
	    """Two-layer heterogeneous encoder with a dot-product link decoder.

	    Authors get a learned embedding table; papers are projected from their
	    input features to the same width. Two ``HeteroMeanConv`` layers with a
	    ReLU in between produce the final node embeddings.

	    Args:
	        metadata: ``(node_types, edge_types)`` pair passed to both conv layers.
	        paper_in_dim: Size of the raw paper feature vectors.
	        hidden_dim: Output size of the first conv layer.
	        out_dim: Output size of the second conv layer (embedding size).
	        author_in_dim: Width of the author embedding table and of the
	            projected paper features.
	        num_author_nodes: Number of author nodes. Defaults to the module-level
	            ``num_authors``.
	        num_paper_nodes: Number of paper nodes. Defaults to the module-level
	            ``num_papers``.
	    """

	    def __init__(self, metadata, paper_in_dim, hidden_dim=64, out_dim=10, author_in_dim=512,
	                 num_author_nodes=None, num_paper_nodes=None):
	        super().__init__()
	        # Fall back to the script-level counts so existing call sites that do
	        # not pass them keep working unchanged.
	        if num_author_nodes is None:
	            num_author_nodes = num_authors
	        if num_paper_nodes is None:
	            num_paper_nodes = num_papers

	        self.author_emb = nn.Embedding(num_author_nodes, author_in_dim)
	        self.paper_lin = nn.Linear(paper_in_dim, author_in_dim)
	        self.num_nodes_dict = {'author': num_author_nodes, 'paper': num_paper_nodes}

	        self.conv1 = HeteroMeanConv(
	            metadata,
	            in_dims={'author': author_in_dim, 'paper': author_in_dim},
	            out_dim=hidden_dim,
	        )
	        self.conv2 = HeteroMeanConv(
	            metadata,
	            in_dims={'author': hidden_dim, 'paper': hidden_dim},
	            out_dim=out_dim,
	        )
	        self.reset_parameters()

	    def reset_parameters(self):
	        """Re-initialise the embedding table, the paper projection and both convs."""
	        nn.init.xavier_uniform_(self.author_emb.weight)
	        self.paper_lin.reset_parameters()
	        self.conv1.reset_parameters()
	        self.conv2.reset_parameters()

	    def encode(self, data):
	        """Compute embeddings for every node.

	        Args:
	            data: Graph object exposing ``data['paper'].x`` and
	                ``data.edge_index_dict``.

	        Returns:
	            Mapping from node type to its embedding matrix.
	        """
	        x_dict = {
	            # The whole embedding table is used: one row per author node.
	            'author': self.author_emb.weight,
	            'paper': self.paper_lin(data['paper'].x),
	        }
	        x_dict = self.conv1(x_dict, data.edge_index_dict, self.num_nodes_dict)
	        x_dict = {k: F.relu(v) for k, v in x_dict.items()}
	        x_dict = self.conv2(x_dict, data.edge_index_dict, self.num_nodes_dict)
	        return x_dict

	    def decode(self, z_dict, edge_label_index):
	        """Score author-paper pairs by the dot product of their embeddings.

	        Args:
	            z_dict: Mapping with ``'author'`` and ``'paper'`` embedding matrices.
	            edge_label_index: Index tensor that unpacks into ``(author, paper)``
	                index vectors.

	        Returns:
	            One score per pair.
	        """
	        src, dst = edge_label_index
	        return (z_dict['author'][src] * z_dict['paper'][dst]).sum(dim=-1)


	def sample_negative_edges(num_samples, num_authors, num_papers, existing_edges, device):
	    """Draw random (author, paper) pairs that are not known edges.

	    Args:
	        num_samples: Number of negative pairs to return.
	        num_authors: Author ids are drawn uniformly from ``[0, num_authors)``.
	        num_papers: Paper ids are drawn uniformly from ``[0, num_papers)``.
	        existing_edges: Container of ``(author, paper)`` tuples to exclude.
	        device: Device on which the returned tensor is created.

	    Returns:
	        A ``torch.long`` tensor of shape ``(2, num_samples)``; row 0 holds
	        author ids and row 1 paper ids. The same pair may appear twice.
	    """
	    neg_edges = []
	    while len(neg_edges) < num_samples:
	        need = num_samples - len(neg_edges)
	        # Draw twice as many candidates as needed so that one round is usually
	        # enough after rejected pairs are dropped. Sampling stays on the CPU
	        # because the candidates are filtered in Python.
	        src = torch.randint(0, num_authors, (need * 2,), device='cpu')
	        dst = torch.randint(0, num_papers, (need * 2,), device='cpu')
	        for pair in zip(src.tolist(), dst.tolist()):
	            if pair not in existing_edges:
	                neg_edges.append(pair)
	                if len(neg_edges) == num_samples:
	                    break
	    # The explicit (n, 2) shape keeps the result two-dimensional when no pairs
	    # were requested, so callers can always unpack it into (src, dst).
	    pairs = torch.tensor(neg_edges, dtype=torch.long, device=device)
	    return pairs.reshape(len(neg_edges), 2).t().contiguous()


	# ── Training ──────────────────────────────────────────────────────
	model = HeteroRecommender(
	    data.metadata(),
	    paper_in_dim=data['paper'].x.size(-1),
	    hidden_dim=64,
	    out_dim=10,
	    author_in_dim=512,
	).to(device)

	optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-5)

	pos_edge_index = data['author', 'ref', 'paper'].edge_index
	# Only training edges are excluded when sampling negatives, so a held-out
	# validation positive can occasionally be drawn as a negative.
	existing_train_set = set(map(tuple, train_refs[['source', 'target']].to_numpy().tolist()))

	train_edge_batch_size = min(32768, pos_edge_index.size(1))
	num_epochs = 60

	print(f"\nTraining {num_epochs} epochs (batch_size={train_edge_batch_size})...")
	for epoch in range(num_epochs):
	    model.train()
	    optimizer.zero_grad()

	    # One optimisation step per epoch on a random subset of positive edges,
	    # paired with an equal number of freshly sampled negatives.
	    batch_perm = torch.randperm(pos_edge_index.size(1), device=device)[:train_edge_batch_size]
	    pos_batch = pos_edge_index[:, batch_perm]
	    neg_batch = sample_negative_edges(
	        pos_batch.size(1), num_authors, num_papers, existing_train_set, device
	    )

	    z_dict = model.encode(data)
	    pos_score = model.decode(z_dict, pos_batch)
	    neg_score = model.decode(z_dict, neg_batch)

	    # Hinge loss with margin 1: the i-th positive should outscore the i-th
	    # negative by at least 1.
	    loss = (1.0 - pos_score + neg_score).clamp(min=0).mean()
	    loss.backward()
	    optimizer.step()

	    if epoch % 10 == 0 or epoch == num_epochs - 1:
	        print(f'Epoch {epoch:03d}, loss={loss.item():.4f}')

	# ── Evaluation on validation set ──────────────────────────────────
	# Encode the graph once in eval mode without gradient tracking, then move the
	# embeddings to the CPU for the NumPy-based scoring that follows.
	with torch.no_grad():
	    model.eval()
	    node_embeddings = model.encode(data)
	    node_embeddings = {k: v.detach().cpu() for k, v in node_embeddings.items()}

	from numpy.linalg import norm


	def cos_sim(a, b, eps=1e-12):
	    """Return the row-wise cosine similarity of two equally shaped arrays.

	    Args:
	        a: Array whose rows (axis 0) are vectors.
	        b: Array of the same shape as ``a``.
	        eps: Small constant added to the denominator so that an all-zero row
	            yields 0 instead of a division by zero.

	    Returns:
	        Array with one similarity per row.
	    """
	    dot = np.sum(a * b, axis=1)
	    scale = np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1)
	    return dot / (scale + eps)


	# Score every validation pair by the cosine similarity of its two embeddings.
	test_arr = test_refs[['source', 'target']].to_numpy(dtype=np.int64)
	val_author_vecs = node_embeddings['author'][test_arr[:, 0]].numpy()
	val_paper_vecs = node_embeddings['paper'][test_arr[:, 1]].numpy()
	res = cos_sim(val_author_vecs, val_paper_vecs)

	lbl_true = test_refs['label'].to_numpy().flatten()

	# Sweep all candidate thresholds and keep the one with the highest F1.
	precision, recall, thresholds = precision_recall_curve(lbl_true, np.array(res))
	f1_scores = 2 * precision * recall / (precision + recall + 1e-12)
	best_idx = np.argmax(f1_scores)
	best_f1 = f1_scores[best_idx]
	# precision/recall hold one more entry than thresholds; fall back to 0.5 if
	# the best index is that extra final entry.
	if best_idx < len(thresholds):
	    best_threshold = thresholds[best_idx]
	else:
	    best_threshold = 0.5

	print(f"\nBest threshold (val): {best_threshold:.4f}, Best F1 (val): {best_f1:.4f}")

	# Reference point: F1 when cutting at a fixed 0.5.
	lbl_pred_05 = (np.array(res) >= 0.5).astype(int)
	print(f"F1 @ threshold=0.5: {f1_score(lbl_true, lbl_pred_05):.4f}")

	# ── Generate submission ───────────────────────────────────────────
	# Set CS3319_SUBMISSION_PATH to write elsewhere; when unset, the original
	# location is used.
	output_path = os.environ.get(
	    "CS3319_SUBMISSION_PATH", os.path.join("/home/lzc", "submission.csv")
	)
	# Same empty-input guard as used when sizing the graph, so column indexing
	# below still works when there are no pairs to predict.
	test_arr_final = (
	    np.array(refs_to_pred, dtype=np.int64)
	    if len(refs_to_pred) > 0
	    else np.zeros((0, 2), dtype=np.int64)
	)
	res_final = cos_sim(
	    node_embeddings['author'][test_arr_final[:, 0]].numpy(),
	    node_embeddings['paper'][test_arr_final[:, 1]].numpy(),
	)

	# Apply the threshold selected on the validation set.
	res_pred = (res_final >= best_threshold).astype(int)

	data_out = [[idx, str(int(p))] for idx, p in enumerate(res_pred)]
	df = pd.DataFrame(data_out, columns=['Index', 'Predicted'], dtype=object)
	# Create the target directory if needed so the write cannot fail on a
	# missing folder.
	os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
	df.to_csv(output_path, index=False)
	print(f"\nSubmission saved to: {output_path}")
	# Avoid a NaN (and a NumPy warning) when there were no pairs to predict.
	positive_ratio = res_pred.mean() if res_pred.size else 0.0
	print(f"Predicted positive ratio: {positive_ratio:.4f}")