File size: 13,868 Bytes

f28d994

"""Adapted from project-example-2026-pygver.ipynb for local execution."""
import os
import pickle as pkl
import random

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import tqdm

from torch_geometric.data import HeteroData
from sklearn.metrics import f1_score, precision_recall_curve

# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print('torch:', torch.__version__)
print('device:', device)


def set_seed(seed=0):
    """Seed every random number generator this script relies on.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch's default generator; when CUDA is available, all CUDA generators
    are seeded as well.

    Args:
        seed: Seed value handed unchanged to each library.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    # The CUDA generators are only touched when a CUDA runtime is available.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


# Seed once at import time so the rest of the script is reproducible.
set_seed(0)

# ── Paths ──────────────────────────────────────────────────────────
# Directory that holds every input file read by this script.
base_path = "/home/lzc/cs3319-project"

# Resolve all input files against base_path in a single pass; the order of the
# file names matches the order of the variables on the left-hand side.
(
    cite_file,
    train_ref_file,
    test_ref_file,
    coauthor_file,
    feature_file,
) = (
    os.path.join(base_path, file_name)
    for file_name in (
        "paper_file_ann.txt",
        "bipartite_train_ann.txt",
        "bipartite_test_ann.txt",
        "author_file_ann.txt",
        "feature.pkl",
    )
)


def read_txt(file):
    """Read a whitespace-separated integer table from a text file.

    Each non-blank line becomes one row: the line is split on whitespace and
    every token is converted with ``int``. Blank or whitespace-only lines are
    skipped, so a stray empty line (for example at the end of the file) does
    not produce an empty row that would break code expecting fixed-width rows.

    Args:
        file: Path of the text file to read.

    Returns:
        A list of rows, each row being a list of ints in file order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a token is not a valid integer literal.
    """
    rows = []
    with open(file, "r", encoding="utf-8") as f:
        for line in f:
            # str.split() without arguments also discards surrounding whitespace.
            tokens = line.split()
            if tokens:
                rows.append([int(token) for token in tokens])
    return rows


# Read the four integer edge lists; the generator yields them in the same
# order as the names on the left-hand side.
citation, existing_refs, refs_to_pred, coauthor = (
    read_txt(edge_path)
    for edge_path in (cite_file, train_ref_file, test_ref_file, coauthor_file)
)

# NOTE(review): unpickling can execute arbitrary code, so feature_file must
# come from a trusted source.
with open(feature_file, 'rb') as feature_fh:
    paper_feature = pkl.load(feature_fh)

# Quick sanity summary of everything that was loaded.
# presumably paper_feature is array-like with a .shape attribute; confirm
# against the pickle's producer.
print(f"Number of citation edges: {len(citation)}")
print(f"Number of existing references: {len(existing_refs)}")
print(f"Number of author-paper pairs to predict: {len(refs_to_pred)}")
print(f"Number of coauthor edges: {len(coauthor)}")
print(f"Shape of paper features: {paper_feature.shape}")

# ── Build edge dataframes ─────────────────────────────────────────
def _indexed_edges(pairs, prefix):
    """Return a source/target DataFrame whose index labels are prefix + row number."""
    frame = pd.DataFrame(pairs, columns=['source', 'target'])
    return frame.set_index(prefix + frame.index.astype(str))


# The index prefix tells the three edge kinds apart: c- (citation),
# r- (reference) and a- (coauthor).
cite_edges = _indexed_edges(citation, "c-")
ref_edges = _indexed_edges(existing_refs, "r-")
coauthor_edges = _indexed_edges(coauthor, "a-")

# ── Build node DataFrames ─────────────────────────────────────────
# Paper ids: both ends of a citation plus the target side of a reference.
node_tmp = pd.concat([
    cite_edges['source'],
    cite_edges['target'],
    ref_edges['target'],
])
node_papers = pd.DataFrame(index=pd.unique(node_tmp))

# Author ids: the source side of a reference plus both ends of a coauthorship.
node_tmp = pd.concat([
    ref_edges['source'],
    coauthor_edges['source'],
    coauthor_edges['target'],
])
node_authors = pd.DataFrame(index=pd.unique(node_tmp))

print(f"Number of paper nodes: {len(node_papers)}, number of author nodes: {len(node_authors)}")

# ── Train / validation split ──────────────────────────────────────
# 90% of the known references are used for training; the remaining rows become
# the positive half of the validation set.
train_refs = ref_edges.sample(frac=0.9, random_state=0, axis=0)
held_out_mask = ~ref_edges.index.isin(train_refs.index)
test_true_refs = ref_edges.loc[held_out_mask].assign(label=1)

# All known (author, paper) references, used to reject sampled negatives.
existing_ref_set = {
    tuple(pair) for pair in ref_edges[['source', 'target']].to_numpy().tolist()
}
num_test_pos = len(test_true_refs)
author_ids = node_authors.index.to_numpy(dtype=np.int64)
paper_ids = node_papers.index.to_numpy(dtype=np.int64)


def _draw_unseen_pairs(count, authors, papers, known_pairs, generator):
    """Rejection-sample ``count`` (author, paper) pairs that are not in ``known_pairs``.

    One author and then one paper are drawn per attempt, in that order, so the
    sequence of random draws is fixed for a given generator state.
    """
    accepted = []
    while len(accepted) < count:
        candidate = (int(generator.choice(authors)), int(generator.choice(papers)))
        if candidate in known_pairs:
            continue
        accepted.append(candidate)
    return accepted


# As many negatives as held-out positives, drawn from a dedicated seeded generator.
rng = np.random.default_rng(0)
neg_pairs = _draw_unseen_pairs(num_test_pos, author_ids, paper_ids, existing_ref_set, rng)

test_false_refs = pd.DataFrame(neg_pairs, columns=['source', 'target']).assign(label=0)

# Merge both halves and shuffle them with a fixed seed.
test_refs = (
    pd.concat([test_true_refs, test_false_refs], ignore_index=True)
    .sample(frac=1, random_state=0, axis=0)
    .reset_index(drop=True)
)
print(f"Validation set: {len(test_refs)} pairs "
      f"(pos={test_refs['label'].sum()}, neg={len(test_refs) - test_refs['label'].sum()})")

# ── Build HeteroData ──────────────────────────────────────────────
# Edge lists as (num_edges, 2) long tensors. Only the training split of the
# author→paper references goes into the graph.
train_ref_tensor = torch.as_tensor(train_refs[['source', 'target']].to_numpy(), dtype=torch.long)
cite_tensor = torch.as_tensor(cite_edges[['source', 'target']].to_numpy(), dtype=torch.long)
coauthor_tensor = torch.as_tensor(coauthor_edges[['source', 'target']].to_numpy(), dtype=torch.long)

# Pairs to predict; kept two-dimensional even when there is nothing to predict.
test_ref_arr = np.array(refs_to_pred, dtype=np.int64) if len(refs_to_pred) > 0 else np.zeros((0, 2), dtype=np.int64)
# Node counts are "largest id seen anywhere + 1", so every id that appears in an
# edge list or in the pairs to predict is a valid row index.
num_authors = int(max(
    ref_edges['source'].max(),
    coauthor_edges['source'].max(),
    coauthor_edges['target'].max(),
    test_ref_arr[:, 0].max() if len(test_ref_arr) else 0,
) + 1)
num_papers = int(max(
    cite_edges['source'].max(),
    cite_edges['target'].max(),
    ref_edges['target'].max(),
    test_ref_arr[:, 1].max() if len(test_ref_arr) else 0,
    paper_feature.shape[0] - 1,
) + 1)

paper_x = torch.as_tensor(paper_feature, dtype=torch.float)
# num_papers >= paper_feature.shape[0] by construction (see the max above), so
# the feature matrix can only be too short, never too long. Papers without a
# feature row get an all-zero feature vector.
if paper_x.size(0) < num_papers:
    pad = torch.zeros(num_papers - paper_x.size(0), paper_x.size(1), dtype=paper_x.dtype)
    paper_x = torch.cat([paper_x, pad], dim=0)

data = HeteroData()
data['author'].num_nodes = num_authors
data['paper'].x = paper_x
data['paper'].num_nodes = num_papers
# Author→paper references and the reversed paper→author relation.
data['author', 'ref', 'paper'].edge_index = train_ref_tensor.t().contiguous()
data['paper', 'beref', 'author'].edge_index = train_ref_tensor[:, [1, 0]].t().contiguous()
# Citations and coauthorships are stored in both directions.
data['paper', 'cite', 'paper'].edge_index = torch.cat([
    cite_tensor,
    cite_tensor[:, [1, 0]],
], dim=0).t().contiguous()
data['author', 'coauthor', 'author'].edge_index = torch.cat([
    coauthor_tensor,
    coauthor_tensor[:, [1, 0]],
], dim=0).t().contiguous()

data = data.to(device)
print(data)
print('metadata:', data.metadata())


# ── Model ─────────────────────────────────────────────────────────
class HeteroMeanConv(nn.Module):
    """Mean-aggregation message-passing layer for typed nodes and edges.

    Every node type has a biased linear self-transform and every edge type has
    a bias-free linear transform that is applied to the mean of the source
    features arriving at each destination node. The output for a node type is
    the average of its self term and one relation term per processed edge type
    that has this node type as destination.

    Args:
        metadata: ``(node_types, edge_types)`` pair; each edge type is a
            ``(src_type, relation, dst_type)`` triple.
        in_dims: Mapping from node type to its input feature width.
        out_dim: Output feature width shared by all node types.
    """

    def __init__(self, metadata, in_dims, out_dim):
        super().__init__()
        declared_nodes, declared_edges = metadata
        self.node_types = list(declared_nodes)
        self.edge_types = list(declared_edges)

        # One bias-free projection per relation, sized by the source node type.
        self.rel_lins = nn.ModuleDict()
        for edge_type in self.edge_types:
            self.rel_lins[self._key(edge_type)] = nn.Linear(
                in_dims[edge_type[0]], out_dim, bias=False
            )

        # One biased self-projection per node type.
        self.self_lins = nn.ModuleDict()
        for node_type in self.node_types:
            self.self_lins[node_type] = nn.Linear(in_dims[node_type], out_dim)

    @staticmethod
    def _key(edge_type):
        """Flatten an edge-type tuple into a string usable as a ModuleDict key."""
        return '__'.join(edge_type)

    @staticmethod
    def _mean_by_destination(messages, dst, num_dst):
        """Average ``messages`` per destination index; unreached rows stay zero."""
        total = messages.new_zeros((num_dst, messages.size(-1)))
        count = messages.new_zeros((num_dst, 1))
        total.index_add_(0, dst, messages)
        count.index_add_(0, dst, messages.new_ones((dst.numel(), 1)))
        # Clamp so destinations without incoming edges divide by one, not zero.
        return total / count.clamp(min=1.0)

    def reset_parameters(self):
        """Re-initialise every relation and self projection."""
        for group in (self.rel_lins, self.self_lins):
            for linear in group.values():
                linear.reset_parameters()

    def forward(self, x_dict, edge_index_dict, num_nodes_dict):
        """Compute new features for every node type.

        Args:
            x_dict: Mapping from node type to its feature matrix.
            edge_index_dict: Mapping from edge-type triple to a two-row index
                tensor (source indices, destination indices).
            num_nodes_dict: Mapping from node type to its number of nodes.

        Returns:
            Mapping from node type to its output feature matrix.
        """
        combined = {}
        term_count = {}
        for node_type in self.node_types:
            combined[node_type] = self.self_lins[node_type](x_dict[node_type])
            # The self term counts as one contribution.
            term_count[node_type] = 1

        for edge_type, edge_index in edge_index_dict.items():
            src_type, _relation, dst_type = edge_type
            src, dst = edge_index
            neighbour_mean = self._mean_by_destination(
                x_dict[src_type][src], dst, num_nodes_dict[dst_type]
            )
            projected = self.rel_lins[self._key(edge_type)](neighbour_mean)
            combined[dst_type] = combined[dst_type] + projected
            term_count[dst_type] += 1

        return {
            node_type: combined[node_type] / term_count[node_type]
            for node_type in self.node_types
        }


class HeteroRecommender(nn.Module):
    """Two-layer heterogeneous encoder with a dot-product link decoder.

    Authors get a learned embedding table and papers are linearly projected
    from their input features to the same width. Two ``HeteroMeanConv`` layers
    with a ReLU in between then produce the final per-node embeddings.

    The node counts are read from the module-level ``num_authors`` and
    ``num_papers``, which must be defined before the model is constructed.

    Args:
        metadata: ``(node_types, edge_types)`` pair forwarded to both conv layers.
        paper_in_dim: Width of the raw paper feature vectors.
        hidden_dim: Output width of the first conv layer.
        out_dim: Width of the final embeddings.
        author_in_dim: Width of the author embedding table and of the projected
            paper features.
    """

    def __init__(self, metadata, paper_in_dim, hidden_dim=64, out_dim=10, author_in_dim=512):
        super().__init__()
        self.author_emb = nn.Embedding(num_authors, author_in_dim)
        self.paper_lin = nn.Linear(paper_in_dim, author_in_dim)
        self.num_nodes_dict = dict(author=num_authors, paper=num_papers)

        # Both node types share one width at the input of each conv layer.
        node_kinds = ('author', 'paper')
        self.conv1 = HeteroMeanConv(
            metadata,
            in_dims=dict.fromkeys(node_kinds, author_in_dim),
            out_dim=hidden_dim,
        )
        self.conv2 = HeteroMeanConv(
            metadata,
            in_dims=dict.fromkeys(node_kinds, hidden_dim),
            out_dim=out_dim,
        )
        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialise the embedding table (Xavier uniform) and all sub-modules."""
        nn.init.xavier_uniform_(self.author_emb.weight)
        for module in (self.paper_lin, self.conv1, self.conv2):
            module.reset_parameters()

    def encode(self, data):
        """Return a dict of final node embeddings for the graph in ``data``.

        ``data`` must expose ``data['paper'].x`` and ``data.edge_index_dict``.
        """
        edge_index_dict = data.edge_index_dict
        inputs = {
            'author': self.author_emb.weight,
            'paper': self.paper_lin(data['paper'].x),
        }
        hidden = self.conv1(inputs, edge_index_dict, self.num_nodes_dict)
        activated = {}
        for node_type, features in hidden.items():
            activated[node_type] = F.relu(features)
        return self.conv2(activated, data.edge_index_dict, self.num_nodes_dict)

    def decode(self, z_dict, edge_label_index):
        """Score (author, paper) pairs by the dot product of their embeddings.

        Args:
            z_dict: Embeddings as returned by ``encode``.
            edge_label_index: Two-row index tensor; row 0 holds author indices
                and row 1 holds paper indices.

        Returns:
            One score per column of ``edge_label_index``.
        """
        author_idx, paper_idx = edge_label_index
        author_z = z_dict['author'][author_idx]
        paper_z = z_dict['paper'][paper_idx]
        return torch.sum(author_z * paper_z, dim=-1)


def sample_negative_edges(num_samples, num_authors, num_papers, existing_edges, device):
    """Rejection-sample (author, paper) index pairs that are not known edges.

    Candidates are drawn uniformly on the CPU in batches of twice the number
    still needed; pairs found in ``existing_edges`` are discarded and the loop
    repeats until enough pairs were accepted. Accepted pairs are not
    de-duplicated against each other.

    Args:
        num_samples: Number of negative pairs to return.
        num_authors: Exclusive upper bound for sampled author indices.
        num_papers: Exclusive upper bound for sampled paper indices.
        existing_edges: Container of ``(author, paper)`` tuples to avoid;
            membership is tested with ``in``.
        device: Device on which the returned tensor is created.

    Returns:
        A contiguous long tensor of shape ``(2, num_samples)``: row 0 holds
        author indices and row 1 holds paper indices. For ``num_samples <= 0``
        the result has shape ``(2, 0)``.
    """
    if num_samples <= 0:
        # torch.tensor([]) would be one-dimensional and could not be unpacked
        # into (src, dst) by callers, so build the empty two-row result directly.
        return torch.empty((2, 0), dtype=torch.long, device=device)

    neg_edges = []
    while len(neg_edges) < num_samples:
        need = num_samples - len(neg_edges)
        # Oversample by 2x so that one round is usually enough after rejection.
        src = torch.randint(0, num_authors, (need * 2,), device='cpu')
        dst = torch.randint(0, num_papers, (need * 2,), device='cpu')
        accepted = [
            pair
            for pair in zip(src.tolist(), dst.tolist())
            if pair not in existing_edges
        ]
        # Keep only as many as are still missing, in draw order.
        neg_edges.extend(accepted[:need])
    return torch.tensor(neg_edges, dtype=torch.long, device=device).t().contiguous()


# ── Training ──────────────────────────────────────────────────────
model = HeteroRecommender(
    data.metadata(),
    paper_in_dim=data['paper'].x.size(-1),
    hidden_dim=64,
    out_dim=10,
    author_in_dim=512,
)
model = model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-5)

# Positive training edges and the same pairs as a set, which is used to
# reject sampled negatives.
pos_edge_index = data['author', 'ref', 'paper'].edge_index
existing_train_set = {
    tuple(pair) for pair in train_refs[['source', 'target']].to_numpy().tolist()
}

train_edge_batch_size = min(32768, pos_edge_index.size(1))
num_epochs = 60


def _run_training_step():
    """Run one optimisation step on a random edge batch and return its loss."""
    model.train()
    optimizer.zero_grad()

    # Random subset of the positive edges plus an equal number of negatives.
    chosen = torch.randperm(pos_edge_index.size(1), device=device)[:train_edge_batch_size]
    pos_batch = pos_edge_index[:, chosen]
    neg_batch = sample_negative_edges(
        pos_batch.size(1), num_authors, num_papers, existing_train_set, device
    )

    embeddings = model.encode(data)
    pos_score = model.decode(embeddings, pos_batch)
    neg_score = model.decode(embeddings, neg_batch)

    # Hinge loss with margin 1 between paired positive and negative scores.
    step_loss = torch.clamp(1.0 - pos_score + neg_score, min=0).mean()
    step_loss.backward()
    optimizer.step()
    return step_loss


print(f"\nTraining {num_epochs} epochs (batch_size={train_edge_batch_size})...")
for epoch in range(num_epochs):
    loss = _run_training_step()

    # Report every tenth epoch and the final one.
    if epoch == num_epochs - 1 or epoch % 10 == 0:
        print(f'Epoch {epoch:03d}, loss={loss.item():.4f}')

# ── Evaluation on validation set ──────────────────────────────────
# Switch to inference mode and compute the final embeddings without building
# an autograd graph, then move them to the CPU for the NumPy-based scoring.
model.eval()
with torch.no_grad():
    node_embeddings = {
        node_type: embedding.detach().cpu()
        for node_type, embedding in model.encode(data).items()
    }

from numpy.linalg import norm


def cos_sim(a, b, eps=1e-12):
    """Return the row-wise cosine similarity of two equally shaped 2-D arrays.

    ``eps`` is added to the product of the row norms so that all-zero rows
    yield a similarity of 0 instead of a division by zero.
    """
    dot_products = (a * b).sum(axis=1)
    norm_products = norm(a, axis=1) * norm(b, axis=1)
    return dot_products / (norm_products + eps)


# Score every validation pair by the cosine similarity of its two embeddings.
test_arr = test_refs[['source', 'target']].to_numpy(dtype=np.int64)
val_author_idx = test_arr[:, 0]
val_paper_idx = test_arr[:, 1]
res = cos_sim(
    node_embeddings['author'][val_author_idx].numpy(),
    node_embeddings['paper'][val_paper_idx].numpy(),
)

lbl_true = test_refs['label'].to_numpy().reshape(-1)

# Sweep all candidate thresholds and keep the one with the highest F1.
precision, recall, thresholds = precision_recall_curve(lbl_true, np.asarray(res))

# The small constant avoids 0/0 where precision and recall are both zero.
f1_scores = (2 * precision * recall) / (precision + recall + 1e-12)
best_idx = f1_scores.argmax()
# precision/recall have one more entry than thresholds; fall back to 0.5 when
# the best index is that extra last entry.
if best_idx < len(thresholds):
    best_threshold = thresholds[best_idx]
else:
    best_threshold = 0.5
best_f1 = f1_scores[best_idx]

print(f"\nBest threshold (val): {best_threshold:.4f}, Best F1 (val): {best_f1:.4f}")

# Reference point: F1 with a fixed cut-off of 0.5.
lbl_pred_05 = (np.asarray(res) >= 0.5).astype(int)
print(f"F1 @ threshold=0.5: {f1_score(lbl_true, lbl_pred_05):.4f}")

# ── Generate submission ───────────────────────────────────────────
output_path = os.path.join("/home/lzc", "submission.csv")
# Keep the array two-dimensional even when there is nothing to predict, so the
# column indexing below does not fail on an empty list (same guard as the one
# used when the graph is built).
test_arr_final = (
    np.array(refs_to_pred, dtype=np.int64)
    if len(refs_to_pred) > 0
    else np.zeros((0, 2), dtype=np.int64)
)
# Score the pairs to predict exactly like the validation pairs.
res_final = cos_sim(
    node_embeddings['author'][test_arr_final[:, 0]].numpy(),
    node_embeddings['paper'][test_arr_final[:, 1]].numpy(),
)

# Apply the threshold selected on the validation set.
res_pred = (res_final >= best_threshold).astype(int)

# One row per pair: its position in the input file and the 0/1 prediction as text.
data_out = [[idx, str(int(p))] for idx, p in enumerate(res_pred)]
df = pd.DataFrame(data_out, columns=['Index', 'Predicted'], dtype=object)
df.to_csv(output_path, index=False)
print(f"\nSubmission saved to: {output_path}")
# The mean of an empty array is NaN (with a warning); report 0 in that case.
positive_ratio = res_pred.mean() if res_pred.size else 0.0
print(f"Predicted positive ratio: {positive_ratio:.4f}")