cs3319-project2 / code /run_baseline.py
NLP-beginner's picture
CS3319 Project 2 final deliverable (public F1 = 0.96626)
f28d994
Raw
History Blame Contribute Delete
13.9 kB
"""Adapted from project-example-2026-pygver.ipynb for local execution."""
import os
import pickle as pkl
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import tqdm
from torch_geometric.data import HeteroData
from sklearn.metrics import f1_score, precision_recall_curve
# Run on the first CUDA device when one is visible, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print('torch:', torch.__version__)
print('device:', device)
def set_seed(seed=0):
    """Seed the random number generators used by this script.

    Calls ``random.seed``, ``np.random.seed`` and ``torch.manual_seed`` with
    the same value; when CUDA is available it additionally calls
    ``torch.cuda.manual_seed_all``.

    Args:
        seed: Integer seed passed to every generator.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


# Seed everything once, before any data is sampled or any weight is created.
set_seed(0)
# ── Paths ──────────────────────────────────────────────────────────
# The dataset directory defaults to the original hard-coded location, but can
# be redirected with the CS3319_DATA_DIR environment variable so the script is
# runnable on machines where that directory does not exist.
base_path = os.environ.get("CS3319_DATA_DIR", "/home/lzc/cs3319-project")
cite_file = os.path.join(base_path, "paper_file_ann.txt")
train_ref_file = os.path.join(base_path, "bipartite_train_ann.txt")
test_ref_file = os.path.join(base_path, "bipartite_test_ann.txt")
coauthor_file = os.path.join(base_path, "author_file_ann.txt")
feature_file = os.path.join(base_path, "feature.pkl")
def read_txt(file):
    """Parse a text file of whitespace-separated integers, one row per line.

    Args:
        file: Path of the text file to read.

    Returns:
        A list containing one ``list[int]`` per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a token is not a valid integer literal.
    """
    with open(file, "r") as f:
        # Blank lines (for example a trailing empty line) are skipped: they
        # would otherwise become empty rows and make the table ragged.
        return [
            [int(token) for token in fields]
            for fields in (line.split() for line in f)
            if fields
        ]
# Read the four integer pair files: citations, known author-paper references,
# author-paper pairs to predict, and co-authorships (in that order).
citation, existing_refs, refs_to_pred, coauthor = (
    read_txt(path)
    for path in (cite_file, train_ref_file, test_ref_file, coauthor_file)
)

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load a feature.pkl that comes from a trusted source.
with open(feature_file, 'rb') as feature_fh:
    paper_feature = pkl.load(feature_fh)

# Size summary of everything that was loaded.
print(f"Number of citation edges: {len(citation)}")
print(f"Number of existing references: {len(existing_refs)}")
print(f"Number of author-paper pairs to predict: {len(refs_to_pred)}")
print(f"Number of coauthor edges: {len(coauthor)}")
print(f"Shape of paper features: {paper_feature.shape}")
# ── Build edge dataframes ─────────────────────────────────────────
def _edge_frame(pairs, prefix):
    """Return a source/target DataFrame whose row labels are ``prefix`` + row position."""
    frame = pd.DataFrame(pairs, columns=['source', 'target'])
    return frame.set_index(prefix + frame.index.astype(str))


cite_edges = _edge_frame(citation, "c-")
ref_edges = _edge_frame(existing_refs, "r-")
coauthor_edges = _edge_frame(coauthor, "a-")

# ── Build node DataFrames ─────────────────────────────────────────
# Papers: every id seen as a citation endpoint or as the target of a reference.
paper_endpoints = [cite_edges['source'], cite_edges['target'], ref_edges['target']]
node_papers = pd.DataFrame(index=pd.unique(pd.concat(paper_endpoints)))

# Authors: every id seen as the source of a reference or as a co-author endpoint.
author_endpoints = [ref_edges['source'], coauthor_edges['source'], coauthor_edges['target']]
node_authors = pd.DataFrame(index=pd.unique(pd.concat(author_endpoints)))

print(f"Number of paper nodes: {len(node_papers)}, number of author nodes: {len(node_authors)}")
# ── Train / validation split ──────────────────────────────────────
# 90% of the known references are used for training; the remaining rows become
# the positive half of the validation set.
train_refs = ref_edges.sample(frac=0.9, random_state=0, axis=0)
in_train = ref_edges.index.isin(train_refs.index)
test_true_refs = ref_edges[~in_train].copy()
test_true_refs['label'] = 1

# Every known (author, paper) reference, used to reject sampled negatives.
existing_ref_set = {
    tuple(pair) for pair in ref_edges[['source', 'target']].to_numpy().tolist()
}
num_test_pos = len(test_true_refs)
author_ids = node_authors.index.to_numpy(dtype=np.int64)
paper_ids = node_papers.index.to_numpy(dtype=np.int64)

# Rejection-sample exactly as many negatives as there are held-out positives.
rng = np.random.default_rng(0)
neg_pairs = []
while len(neg_pairs) < num_test_pos:
    # The author is drawn before the paper on every iteration, so the
    # generator is consumed in a fixed order.
    candidate = (int(rng.choice(author_ids)), int(rng.choice(paper_ids)))
    if candidate in existing_ref_set:
        continue
    neg_pairs.append(candidate)

test_false_refs = pd.DataFrame(neg_pairs, columns=['source', 'target'])
test_false_refs['label'] = 0

# Merge both halves and shuffle them with a fixed seed.
test_refs = (
    pd.concat([test_true_refs, test_false_refs], ignore_index=True)
    .sample(frac=1, random_state=0, axis=0)
    .reset_index(drop=True)
)
print(f"Validation set: {len(test_refs)} pairs "
      f"(pos={test_refs['label'].sum()}, neg={len(test_refs) - test_refs['label'].sum()})")
# ── Build HeteroData ──────────────────────────────────────────────
def _pairs_to_long(frame):
    """Return the frame's source/target columns as a ``torch.long`` tensor."""
    return torch.as_tensor(frame[['source', 'target']].to_numpy(), dtype=torch.long)


train_ref_tensor = _pairs_to_long(train_refs)
cite_tensor = _pairs_to_long(cite_edges)
coauthor_tensor = _pairs_to_long(coauthor_edges)

# Pairs to predict; an empty list still yields a well-formed (0, 2) array.
if len(refs_to_pred) > 0:
    test_ref_arr = np.array(refs_to_pred, dtype=np.int64)
else:
    test_ref_arr = np.zeros((0, 2), dtype=np.int64)

# Node counts are "largest id seen anywhere + 1", so every id that appears in
# an edge list or in the pairs to predict is a valid row index.
author_id_maxima = [
    ref_edges['source'].max(),
    coauthor_edges['source'].max(),
    coauthor_edges['target'].max(),
    test_ref_arr[:, 0].max() if len(test_ref_arr) else 0,
]
num_authors = int(max(author_id_maxima) + 1)

paper_id_maxima = [
    cite_edges['source'].max(),
    cite_edges['target'].max(),
    ref_edges['target'].max(),
    test_ref_arr[:, 1].max() if len(test_ref_arr) else 0,
    paper_feature.shape[0] - 1,
]
num_papers = int(max(paper_id_maxima) + 1)

# Align the feature matrix with the paper count: zero-pad missing rows.
paper_x = torch.as_tensor(paper_feature, dtype=torch.float)
missing_rows = num_papers - paper_x.size(0)
if missing_rows > 0:
    pad = torch.zeros(missing_rows, paper_x.size(1), dtype=paper_x.dtype)
    paper_x = torch.cat([paper_x, pad], dim=0)
elif missing_rows < 0:
    # NOTE(review): num_papers is at least paper_feature.shape[0] by
    # construction above, so this truncation cannot trigger; kept as a guard.
    paper_x = paper_x[:num_papers]
def _both_directions(pairs):
    """Append the reversed copy of every edge and return a contiguous 2-row index."""
    return torch.cat([pairs, pairs[:, [1, 0]]], dim=0).t().contiguous()


data = HeteroData()
# Authors carry no input features, only a node count; papers carry features.
data['author'].num_nodes = num_authors
data['paper'].x = paper_x
data['paper'].num_nodes = num_papers

# Training references are stored once per direction under separate relations.
data['author', 'ref', 'paper'].edge_index = train_ref_tensor.t().contiguous()
data['paper', 'beref', 'author'].edge_index = train_ref_tensor[:, [1, 0]].t().contiguous()

# Citation and co-author edges are stored in both directions in one relation.
data['paper', 'cite', 'paper'].edge_index = _both_directions(cite_tensor)
data['author', 'coauthor', 'author'].edge_index = _both_directions(coauthor_tensor)

data = data.to(device)
print(data)
print('metadata:', data.metadata())
# ── Model ─────────────────────────────────────────────────────────
class HeteroMeanConv(nn.Module):
    """Mean-aggregating message-passing layer over typed nodes and edges.

    Each node type owns a biased linear "self" transform and each edge type
    owns a bias-free linear transform applied to the mean of the source
    features arriving at every destination node.  The output for a node type
    is the sum of its self term and of all relation terms that target it,
    divided by the number of terms summed.

    Args:
        metadata: ``(node_types, edge_types)`` pair; every edge type is a
            ``(src_type, relation, dst_type)`` triple of strings.
        in_dims: Mapping from node type to its input feature width.
        out_dim: Output feature width, shared by all node types.
    """

    def __init__(self, metadata, in_dims, out_dim):
        super().__init__()
        node_types, edge_types = metadata
        self.node_types = list(node_types)
        self.edge_types = list(edge_types)

        # Relation layers are created before the self layers, one per edge
        # type, each sized by the width of the edge's source node type.
        self.rel_lins = nn.ModuleDict()
        for edge_type in self.edge_types:
            self.rel_lins[self._key(edge_type)] = nn.Linear(
                in_dims[edge_type[0]], out_dim, bias=False
            )

        self.self_lins = nn.ModuleDict()
        for node_type in self.node_types:
            self.self_lins[node_type] = nn.Linear(in_dims[node_type], out_dim)

    @staticmethod
    def _key(edge_type):
        """Join the parts of an edge-type tuple into a ``ModuleDict`` key."""
        return '__'.join(edge_type)

    def reset_parameters(self):
        """Re-initialise every relation layer, then every self layer."""
        for layer in (*self.rel_lins.values(), *self.self_lins.values()):
            layer.reset_parameters()

    def forward(self, x_dict, edge_index_dict, num_nodes_dict):
        """Run one round of mean aggregation.

        Args:
            x_dict: Mapping from node type to its feature matrix.
            edge_index_dict: Mapping from edge-type triple to a two-row index
                tensor (row 0 source indices, row 1 destination indices).
            num_nodes_dict: Mapping from node type to its node count; used as
                the row count of the per-destination accumulation buffers.

        Returns:
            Mapping from node type to its transformed feature matrix.
        """
        combined = {}
        term_count = {}
        for node_type in self.node_types:
            combined[node_type] = self.self_lins[node_type](x_dict[node_type])
            term_count[node_type] = 1  # the self term

        for edge_type, edge_index in edge_index_dict.items():
            src_type, _rel, dst_type = edge_type
            src_idx, dst_idx = edge_index
            feats = x_dict[src_type]
            n_dst = num_nodes_dict[dst_type]

            # Sum incoming source features and count incoming edges per
            # destination node.
            summed = feats.new_zeros((n_dst, feats.size(-1)))
            summed.index_add_(0, dst_idx, feats[src_idx])
            one_per_edge = torch.ones(
                (dst_idx.numel(), 1), dtype=feats.dtype, device=feats.device
            )
            in_degree = feats.new_zeros((n_dst, 1))
            in_degree.index_add_(0, dst_idx, one_per_edge)

            # Clamp so destinations without incoming edges divide by 1.
            mean_msg = summed / in_degree.clamp(min=1.0)

            relation_lin = self.rel_lins[self._key(edge_type)]
            combined[dst_type] = combined[dst_type] + relation_lin(mean_msg)
            term_count[dst_type] += 1

        return {
            node_type: combined[node_type] / term_count[node_type]
            for node_type in self.node_types
        }
class HeteroRecommender(nn.Module):
    """Two-layer heterogeneous encoder with a dot-product edge scorer.

    Authors start from a learned embedding table; papers start from a linear
    projection of their input features to the same width.  Two
    ``HeteroMeanConv`` layers with a ReLU in between produce the final
    per-node embeddings.

    NOTE: the node counts are read from the module-level ``num_authors`` and
    ``num_papers`` variables, which must exist before instantiation.

    Args:
        metadata: ``(node_types, edge_types)`` pair forwarded to both
            convolution layers.
        paper_in_dim: Width of the raw paper feature vectors.
        hidden_dim: Output width of the first convolution.
        out_dim: Output width of the second convolution (embedding size).
        author_in_dim: Width of the author embedding table and of the
            projected paper features.
    """

    def __init__(self, metadata, paper_in_dim, hidden_dim=64, out_dim=10, author_in_dim=512):
        super().__init__()
        self.author_emb = nn.Embedding(num_authors, author_in_dim)
        self.paper_lin = nn.Linear(paper_in_dim, author_in_dim)
        self.num_nodes_dict = {'author': num_authors, 'paper': num_papers}

        # Both node types enter each convolution with the same width.
        first_in_dims = {'author': author_in_dim, 'paper': author_in_dim}
        second_in_dims = {'author': hidden_dim, 'paper': hidden_dim}
        self.conv1 = HeteroMeanConv(metadata, in_dims=first_in_dims, out_dim=hidden_dim)
        self.conv2 = HeteroMeanConv(metadata, in_dims=second_in_dims, out_dim=out_dim)

        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialise the embedding table (Xavier uniform) and all layers."""
        nn.init.xavier_uniform_(self.author_emb.weight)
        for module in (self.paper_lin, self.conv1, self.conv2):
            module.reset_parameters()

    def encode(self, data):
        """Compute embeddings for every node of ``data``.

        Args:
            data: Graph object providing ``data['paper'].x`` and
                ``data.edge_index_dict``.

        Returns:
            Mapping from node type to its embedding matrix.
        """
        hidden = {
            'author': self.author_emb.weight,
            'paper': self.paper_lin(data['paper'].x),
        }
        hidden = self.conv1(hidden, data.edge_index_dict, self.num_nodes_dict)
        hidden = {node_type: F.relu(feats) for node_type, feats in hidden.items()}
        return self.conv2(hidden, data.edge_index_dict, self.num_nodes_dict)

    def decode(self, z_dict, edge_label_index):
        """Score (author, paper) pairs by the dot product of their embeddings.

        Args:
            z_dict: Mapping with ``'author'`` and ``'paper'`` embedding matrices.
            edge_label_index: Two-row index; row 0 holds author indices and
                row 1 holds paper indices.

        Returns:
            One score per indexed pair.
        """
        author_idx, paper_idx = edge_label_index
        author_z = z_dict['author'][author_idx]
        paper_z = z_dict['paper'][paper_idx]
        return (author_z * paper_z).sum(dim=-1)
def sample_negative_edges(num_samples, num_authors, num_papers, existing_edges, device):
    """Sample (author, paper) index pairs that are absent from ``existing_edges``.

    Candidates are drawn uniformly at random on the CPU, in batches of twice
    the outstanding demand, and filtered by rejection.  Sampled pairs are not
    de-duplicated against each other.

    NOTE: the loop never terminates if every pair in
    ``[0, num_authors) x [0, num_papers)`` is contained in ``existing_edges``.

    Args:
        num_samples: Number of negative pairs to return.
        num_authors: Exclusive upper bound for sampled author indices.
        num_papers: Exclusive upper bound for sampled paper indices.
        existing_edges: Container of ``(author, paper)`` tuples to avoid;
            tested with ``in``, so a ``set`` keeps the check cheap.
        device: Device on which the returned tensor is created.

    Returns:
        A contiguous ``torch.long`` tensor of shape ``(2, num_samples)``; row 0
        holds author indices and row 1 paper indices.  The shape is ``(2, 0)``
        when no samples are requested.
    """
    neg_edges = []
    while len(neg_edges) < num_samples:
        need = num_samples - len(neg_edges)
        src = torch.randint(0, num_authors, (need * 2,), device='cpu')
        dst = torch.randint(0, num_papers, (need * 2,), device='cpu')
        accepted = [
            pair
            for pair in zip(src.tolist(), dst.tolist())
            if pair not in existing_edges
        ]
        # Keep only as many as are still needed, in draw order.
        neg_edges.extend(accepted[:need])

    pairs = torch.tensor(neg_edges, dtype=torch.long, device=device)
    # reshape(-1, 2) keeps the result two-row even when the list is empty;
    # without it an empty request yields a 1-D tensor that callers cannot
    # unpack into (src, dst).
    return pairs.reshape(-1, 2).t().contiguous()
# ── Training ──────────────────────────────────────────────────────
model = HeteroRecommender(
    data.metadata(),
    paper_in_dim=data['paper'].x.size(-1),
    hidden_dim=64,
    out_dim=10,
    author_in_dim=512,
).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-5)

# Positive examples are the author→paper training edges already in the graph.
pos_edge_index = data['author', 'ref', 'paper'].edge_index
num_pos_edges = pos_edge_index.size(1)
existing_train_set = {
    tuple(pair) for pair in train_refs[['source', 'target']].to_numpy().tolist()
}
train_edge_batch_size = min(32768, num_pos_edges)
num_epochs = 60
last_epoch = num_epochs - 1

print(f"\nTraining {num_epochs} epochs (batch_size={train_edge_batch_size})...")
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()

    # One optimisation step per epoch on a random subset of the positives and
    # an equally sized set of freshly sampled negatives.
    chosen = torch.randperm(num_pos_edges, device=device)[:train_edge_batch_size]
    pos_batch = pos_edge_index[:, chosen]
    neg_batch = sample_negative_edges(
        pos_batch.size(1), num_authors, num_papers, existing_train_set, device
    )

    z_dict = model.encode(data)
    pos_score = model.decode(z_dict, pos_batch)
    neg_score = model.decode(z_dict, neg_batch)

    # Margin loss: the i-th positive should outscore the i-th negative by at
    # least 1; smaller gaps are penalised linearly.
    margin_violation = 1.0 - pos_score + neg_score
    loss = margin_violation.clamp(min=0).mean()
    loss.backward()
    optimizer.step()

    is_report_epoch = epoch % 10 == 0 or epoch == last_epoch
    if is_report_epoch:
        print(f'Epoch {epoch:03d}, loss={loss.item():.4f}')
# ── Evaluation on validation set ──────────────────────────────────
# Encode the whole graph once without gradient tracking, then move the
# embeddings to the CPU for the NumPy-based scoring that follows.
model.eval()
with torch.no_grad():
    node_embeddings = {
        node_type: embedding.detach().cpu()
        for node_type, embedding in model.encode(data).items()
    }
from numpy.linalg import norm
def cos_sim(a, b, eps=1e-12):
    """Row-wise cosine similarity between two equally shaped 2-D arrays.

    Args:
        a: Array whose rows are the first vectors.
        b: Array whose rows are the second vectors.
        eps: Constant added to the product of the norms so that a zero vector
            yields a similarity of 0 instead of a division by zero.

    Returns:
        1-D array holding one similarity value per row.
    """
    dot = (a * b).sum(axis=1)
    scale = norm(a, axis=1) * norm(b, axis=1) + eps
    return dot / scale
# Score every validation pair by the cosine similarity of its two embeddings.
test_arr = test_refs[['source', 'target']].to_numpy(dtype=np.int64)
val_author_emb = node_embeddings['author'][test_arr[:, 0]].numpy()
val_paper_emb = node_embeddings['paper'][test_arr[:, 1]].numpy()
res = cos_sim(val_author_emb, val_paper_emb)
lbl_true = test_refs['label'].to_numpy().flatten()

# Threshold search for best F1
precision, recall, thresholds = precision_recall_curve(lbl_true, np.array(res))
# The small constant avoids 0/0 where precision and recall are both zero.
f1_scores = 2 * precision * recall / (precision + recall + 1e-12)
best_idx = np.argmax(f1_scores)
# precision_recall_curve returns one more precision/recall entry than
# thresholds; fall back to 0.5 if the best index is that final entry.
if best_idx < len(thresholds):
    best_threshold = thresholds[best_idx]
else:
    best_threshold = 0.5
best_f1 = f1_scores[best_idx]
print(f"\nBest threshold (val): {best_threshold:.4f}, Best F1 (val): {best_f1:.4f}")

# F1 at threshold=0.5
lbl_pred_05 = (np.array(res) >= 0.5).astype(int)
print(f"F1 @ threshold=0.5: {f1_score(lbl_true, lbl_pred_05):.4f}")
# ── Generate submission ───────────────────────────────────────────
output_path = os.path.join("/home/lzc", "submission.csv")

# An empty list of pairs would produce a 1-D array that cannot be indexed by
# column, so substitute an explicit (0, 2) array in that case.
if len(refs_to_pred) > 0:
    test_arr_final = np.array(refs_to_pred, dtype=np.int64)
else:
    test_arr_final = np.zeros((0, 2), dtype=np.int64)

# Score each requested (author, paper) pair and apply the threshold selected
# on the validation set.
res_final = cos_sim(
    node_embeddings['author'][test_arr_final[:, 0]].numpy(),
    node_embeddings['paper'][test_arr_final[:, 1]].numpy(),
)
res_pred = (res_final >= best_threshold).astype(int)

# One row per pair: its position in the input file and the 0/1 prediction
# written as a string.
data_out = [[idx, str(int(p))] for idx, p in enumerate(res_pred)]
df = pd.DataFrame(data_out, columns=['Index', 'Predicted'], dtype=object)
df.to_csv(output_path, index=False)
print(f"\nSubmission saved to: {output_path}")

# Guard the mean so an empty prediction array reports 0 instead of NaN.
positive_ratio = res_pred.mean() if res_pred.size else 0.0
print(f"Predicted positive ratio: {positive_ratio:.4f}")