cs3319-project2 / code /run_graph_features.py

CS3319 Project 2 final deliverable (public F1 = 0.96626)

f28d994 12 days ago

11.8 kB

	"""Graph structural features + LightGBM classifier for link prediction.

	A fundamentally different approach from the GNN:
	- Explicitly compute graph statistics for each author-paper pair
	- Train a gradient boosting classifier on these features
	- Save the raw test scores so they can be ensembled with GNN predictions
	  for the final submission (the ensembling itself is not done in this script)
	"""
	import os
	import pickle as pkl
	import random
	from collections import defaultdict

	import numpy as np
	import pandas as pd
	from sklearn.preprocessing import StandardScaler
	from sklearn.metrics import f1_score, precision_recall_curve, roc_auc_score
	import lightgbm as lgb


	def set_seed(seed=0):
	    """Seed the global Python ``random`` and NumPy random generators.

	    Args:
	        seed: Value passed to both ``random.seed`` and ``np.random.seed``.
	    """
	    # Same order as before: the stdlib generator first, then NumPy's.
	    for seeder in (random.seed, np.random.seed):
	        seeder(seed)


	# Seed Python's and NumPy's global RNGs before any of the random sampling
	# performed further down in this script.
	set_seed(0)

	# ── Load data ─────────────────────────────────────────────────────
	# Directory holding the edge-list text files and feature.pkl read below.
	base_path = "/home/lzc/cs3319-project"


	def read_txt(file):
	    """Parse a text file of whitespace-separated integers.

	    Args:
	        file: Path of the text file to read.

	    Returns:
	        A list with one ``list[int]`` per non-blank line, in file order.

	    Raises:
	        ValueError: If a token cannot be parsed as an integer.
	    """
	    res_list = []
	    with open(file, "r") as f:
	        for line in f:
	            tokens = line.split()
	            # Skip blank lines (e.g. a trailing empty line): they would
	            # otherwise become empty rows that break ``for a, b in rows``
	            # unpacking and ``np.array(rows)`` in the callers.
	            if tokens:
	                res_list.append([int(tok) for tok in tokens])
	    return res_list


	def _data_path(name):
	    """Return the path of ``name`` inside ``base_path``."""
	    return os.path.join(base_path, name)


	# Edge lists, one row of integers per line of the corresponding file.
	train_edges = read_txt(_data_path("bipartite_train_ann.txt"))
	test_edges = read_txt(_data_path("bipartite_test_ann.txt"))
	coauthor = read_txt(_data_path("author_file_ann.txt"))
	citation = read_txt(_data_path("paper_file_ann.txt"))

	# NOTE(security): unpickling can execute arbitrary code, so feature.pkl must
	# come from a trusted source. The loaded object exposes ``.numpy()``
	# (presumably a torch tensor — confirm) and is converted to a float32 array.
	with open(_data_path("feature.pkl"), "rb") as f:
	    paper_feat = pkl.load(f).numpy().astype(np.float32)

	# Entity counts are hard-coded rather than derived from the loaded files;
	# every per-author / per-paper array below is sized from them.
	n_authors = 6611
	n_papers = 79937
	print(f"Authors: {n_authors}, Papers: {n_papers}")


	# ── Build lookup structures ───────────────────────────────────────
	def log1p_norm(x):
	    """Return the standardised ``log(1 + x)`` of ``x``, clipped to [-5, 5].

	    The values are log-transformed, centred on their mean and divided by
	    their standard deviation; 1e-8 is added to the divisor so a constant
	    input does not divide by zero. The input array is not modified.
	    """
	    logged = np.log1p(x)
	    centred = logged - logged.mean()
	    spread = logged.std() + 1e-8
	    return np.clip(centred / spread, -5, 5)


	print("Building graph structures...")

	# Author <-> paper adjacency built from the training edges (both directions
	# are filled in a single pass).
	author_papers = defaultdict(set)
	paper_authors = defaultdict(set)
	for a, p in train_edges:
	    author_papers[a].add(p)
	    paper_authors[p].add(a)

	# Co-author adjacency, stored symmetrically.
	coauthor_set = defaultdict(set)
	for a1, a2 in coauthor:
	    coauthor_set[a1].add(a2)
	    coauthor_set[a2].add(a1)

	# Citation adjacency: the first column of each row is treated as the citing
	# paper and the second as the cited paper.
	paper_cites_set = defaultdict(set)
	paper_cited_by_set = defaultdict(set)
	for p1, p2 in citation:
	    paper_cites_set[p1].add(p2)
	    paper_cited_by_set[p2].add(p1)


	def _degree_array(adjacency, size):
	    """Return the neighbour count of ids ``0..size-1`` as a float32 array."""
	    return np.array([len(adjacency[i]) for i in range(size)], dtype=np.float32)


	def _rank_fraction(degrees):
	    """Return each entry's argsort rank divided by the array length (float32)."""
	    total = len(degrees)
	    fractions = np.zeros(total, dtype=np.float32)
	    # Position i of the argsort order receives i / total; ties are ordered
	    # however np.argsort happens to order them.
	    fractions[np.argsort(degrees)] = np.arange(total) / total
	    return fractions


	# Degrees
	author_deg = _degree_array(author_papers, n_authors)
	paper_deg = _degree_array(paper_authors, n_papers)
	author_coauthor_deg = _degree_array(coauthor_set, n_authors)
	paper_cite_out = _degree_array(paper_cites_set, n_papers)
	paper_cite_in = _degree_array(paper_cited_by_set, n_papers)

	# For every author, the union of the papers linked to any of their co-authors.
	coauthor_papers_set = defaultdict(set)
	for a in range(n_authors):
	    for ca in coauthor_set[a]:
	        coauthor_papers_set[a] |= author_papers[ca]

	# Author embedding = mean of the L2-normalised features of the author's
	# training papers; authors without training papers keep an all-zero vector.
	from sklearn.preprocessing import normalize
	paper_feat_norm = normalize(paper_feat.astype(np.float64))
	author_avg_emb = np.zeros((n_authors, paper_feat.shape[1]), dtype=np.float32)
	for a in range(n_authors):
	    own_papers = author_papers[a]
	    if not own_papers:
	        continue
	    author_avg_emb[a] = paper_feat_norm[list(own_papers)].mean(axis=0).astype(np.float32)

	# NOTE: no LightGCN / GNN embedding is loaded in this script; the averaged
	# paper feature above stands in for an author embedding.

	# Popularity percentiles: rank of each paper / author by training degree,
	# scaled into [0, 1).
	paper_pop_pct = _rank_fraction(paper_deg)
	author_pop_pct = _rank_fraction(author_deg)

	# ── Feature computation ───────────────────────────────────────────
	def compute_features(pairs, batch_size=100000):
	    """Compute graph structural features for author-paper pairs.

	    Reads the module-level lookup structures (degree arrays, popularity
	    percentiles, ``coauthor_papers_set``, ``author_avg_emb`` and
	    ``paper_feat_norm``).

	    Args:
	        pairs: Array-like of ``(author_id, paper_id)`` rows.
	        batch_size: Number of pairs processed per chunk; must be positive.

	    Returns:
	        A float32 array with one row per pair and 20 columns, laid out as
	        described by the inline comments below. Empty input yields a
	        ``(0, 20)`` array instead of failing.

	    Raises:
	        ValueError: If ``batch_size`` is not positive.

	    NOTE(review): the lookup structures are built from all training edges,
	    so for a pair that is itself a training edge the degrees and the
	    author's averaged embedding already include that pair. Confirm this is
	    acceptable for pairs used as labelled positives.
	    """
	    if batch_size <= 0:
	        raise ValueError(f"batch_size must be positive, got {batch_size}")

	    # Accept plain lists of pairs as well as arrays.
	    pairs = np.asarray(pairs, dtype=np.int64)
	    if pairs.size == 0:
	        # np.vstack([]) would raise; an empty request has an empty answer.
	        return np.zeros((0, 20), dtype=np.float32)

	    n = len(pairs)
	    # Preallocate the result and fill it chunk by chunk; this avoids keeping
	    # a second full copy alive during a final vstack.
	    all_feats = np.zeros((n, 20), dtype=np.float32)
	    no_papers = frozenset()

	    for start in range(0, n, batch_size):
	        end = min(start + batch_size, n)
	        authors = pairs[start:end, 0]
	        papers = pairs[start:end, 1]
	        feats = all_feats[start:end]  # view: writes land in all_feats

	        a_deg = author_deg[authors]
	        p_deg = paper_deg[papers]
	        a_pct = author_pop_pct[authors]
	        p_pct = paper_pop_pct[papers]
	        cite_in = paper_cite_in[papers]
	        cite_out = paper_cite_out[papers]
	        n_coauthors = author_coauthor_deg[authors]

	        # 0-4: Degree features
	        feats[:, 0] = a_deg
	        feats[:, 1] = p_deg
	        feats[:, 2] = n_coauthors
	        feats[:, 3] = cite_in
	        feats[:, 4] = cite_out

	        # 5: Preferential attachment
	        feats[:, 5] = a_deg * p_deg

	        # 6-7: Log-transformed degrees
	        feats[:, 6] = np.log1p(a_deg)
	        feats[:, 7] = np.log1p(p_deg)

	        # 8-9: Popularity percentiles
	        feats[:, 8] = a_pct
	        feats[:, 9] = p_pct

	        # 10: Paper is linked to any co-author of the author (binary).
	        # .get() avoids inserting new keys into the defaultdict.
	        feats[:, 10] = [
	            p in coauthor_papers_set.get(a, no_papers)
	            for a, p in zip(authors.tolist(), papers.tolist())
	        ]

	        # 11: Number of co-authors. Same value as column 2; the column is
	        # kept so the 20-column layout stays unchanged.
	        feats[:, 11] = n_coauthors

	        # 12-13: Paper citation degree / (author degree + 1)
	        feats[:, 12] = cite_in / (a_deg + 1)
	        feats[:, 13] = cite_out / (a_deg + 1)

	        # 14: Dot product of the author's averaged embedding with the
	        # L2-normalised paper embedding
	        a_emb = author_avg_emb[authors]
	        p_emb = paper_feat_norm[papers]
	        feats[:, 14] = np.sum(a_emb * p_emb, axis=1)

	        # 15: Paper degree / (author degree + paper degree + 1)
	        feats[:, 15] = p_deg / (a_deg + p_deg + 1)

	        # 16-17: Binary low-degree ("cold-start") flags
	        feats[:, 16] = a_deg <= 5  # Cold-start author
	        feats[:, 17] = p_deg <= 3  # Cold-start paper

	        # 18-19: Mean and absolute difference of the two percentiles
	        feats[:, 18] = (a_pct + p_pct) / 2
	        feats[:, 19] = np.abs(a_pct - p_pct)

	    return all_feats


	# ── Prepare training data ─────────────────────────────────────────
	print("Preparing training data...")
	existing_set = set(map(tuple, train_edges))


	def _sample_negative_pairs(n_needed):
	    """Draw ``n_needed`` random (author, paper) pairs absent from the training edges.

	    Candidates are drawn uniformly in batches of ``2 * n_needed`` and kept in
	    draw order until enough have been accepted. Duplicates among the sampled
	    negatives are not filtered out.
	    """
	    accepted = []
	    while len(accepted) < n_needed:
	        a = np.random.randint(0, n_authors, size=n_needed * 2)
	        p = np.random.randint(0, n_papers, size=n_needed * 2)
	        for pair in zip(a, p):
	            if pair not in existing_set:
	                accepted.append(pair)
	                if len(accepted) >= n_needed:
	                    break
	    return np.array(accepted)


	# Sample positives for training the feature model
	n_pos_train = min(200000, len(train_edges))
	pos_indices = np.random.choice(len(train_edges), n_pos_train, replace=False)
	train_pos = np.array(train_edges)[pos_indices]

	# Sample negatives (3x positives)
	n_neg_train = n_pos_train * 3
	train_neg = _sample_negative_pairs(n_neg_train)

	print(f"Training samples: {len(train_pos)} pos + {len(train_neg)} neg = {len(train_pos) + len(train_neg)}")

	# Compute features
	print("Computing training features...")
	X_pos = compute_features(train_pos)
	X_neg = compute_features(train_neg)

	X_train = np.vstack([X_pos, X_neg])
	y_train = np.concatenate([np.ones(len(X_pos)), np.zeros(len(X_neg))])

	# Shuffle
	idx = np.random.permutation(len(X_train))
	X_train, y_train = X_train[idx], y_train[idx]

	# ── Validation set ────────────────────────────────────────────────
	print("Creating validation set...")
	# Validation positives come from the training edges not used as training
	# positives. The pool is de-duplicated, so its size (not len(train_edges))
	# bounds how many can be drawn without replacement.
	remaining = list(existing_set - set(map(tuple, train_pos.tolist())))
	n_val_pos = min(50000, len(remaining))
	val_pos_indices = np.random.choice(len(remaining), n_val_pos, replace=False)
	val_pos = np.array([remaining[i] for i in val_pos_indices])

	# One negative per validation positive. These are only checked against the
	# training edges, so they may coincide with training negatives.
	val_neg = _sample_negative_pairs(n_val_pos)

	X_val_pos = compute_features(val_pos)
	X_val_neg = compute_features(val_neg)
	X_val = np.vstack([X_val_pos, X_val_neg])
	y_val = np.concatenate([np.ones(len(val_pos)), np.zeros(len(val_neg))])

	# ── Train LightGBM ────────────────────────────────────────────────
	print("Training LightGBM...")
	# Names for the 20 feature columns, in the column order used by
	# compute_features. They are only used for the importance report below.
	feature_names = [
	    'author_deg',
	    'paper_deg',
	    'author_coauthor_deg',
	    'paper_cite_in',
	    'paper_cite_out',
	    'pref_attach',
	    'log_author_deg',
	    'log_paper_deg',
	    'author_pop_pct',
	    'paper_pop_pct',
	    'coauthor_reads',
	    'n_coauthors',
	    'cite_in_ratio',
	    'cite_out_ratio',
	    'cos_sim_author_paper',
	    'paper_deg_ratio',
	    'cold_start_author',
	    'cold_start_paper',
	    'avg_pop_pct',
	    'pop_pct_diff',
	]

	lgb_params = {
	    "n_estimators": 500,
	    "learning_rate": 0.05,
	    "max_depth": 8,
	    "num_leaves": 63,
	    "subsample": 0.8,
	    "colsample_bytree": 0.8,
	    "min_child_samples": 50,
	    "reg_alpha": 0.1,
	    "reg_lambda": 0.1,
	    "verbose": -1,
	    "random_state": 0,
	    "n_jobs": -1,
	}
	model = lgb.LGBMClassifier(**lgb_params)
	model.fit(X_train, y_train)

	# Validation evaluation: AUC plus the best F1 along the precision-recall curve.
	val_probs = model.predict_proba(X_val)[:, 1]
	val_auc = roc_auc_score(y_val, val_probs)
	precision, recall, thresholds = precision_recall_curve(y_val, val_probs)
	f1s = 2 * precision * recall / (precision + recall + 1e-12)
	best_idx = np.argmax(f1s)
	# precision/recall carry one more entry than thresholds, so the last index
	# has no matching threshold; fall back to 0.5 in that case.
	if best_idx < len(thresholds):
	    best_thresh = thresholds[best_idx]
	else:
	    best_thresh = 0.5
	print(f"LightGBM val F1: {f1s[best_idx]:.4f}, AUC: {val_auc:.4f}, Thresh: {best_thresh:.4f}")

	# Feature importance: report the ten highest-scoring features.
	importances = model.feature_importances_
	ranked = sorted(zip(feature_names, importances), key=lambda item: item[1], reverse=True)
	for name, imp in ranked[:10]:
	    print(f" {name}: {imp:.4f}")

	# ── Predict test set ──────────────────────────────────────────────
	print("\nPredicting test set...")
	test_arr = np.array(test_edges, dtype=np.int64)
	X_test = compute_features(test_arr, batch_size=50000)
	test_probs = model.predict_proba(X_test)[:, 1]

	# Persist the fitted model.
	import joblib
	joblib.dump(model, '/home/lzc/lgb_model.pkl')

	# ── Generate submissions ──────────────────────────────────────────
	# Flag the test pairs that also appear among the training edges. The mask is
	# only saved here; it is not applied to the predictions written below.
	train_set_full = {tuple(edge) for edge in train_edges}
	overlap = train_set_full.intersection(map(tuple, test_edges))
	known_mask = np.array([tuple(pair) in overlap for pair in test_edges])

	# Save raw scores
	np.save('/home/lzc/test_lgb_scores.npy', test_probs)
	np.save('/home/lzc/test_known_mask.npy', known_mask)

	# Write one submission file per threshold from 0.30 to 0.70 in steps of 0.05.
	for thresh in np.arange(0.30, 0.71, 0.05):
	    preds = (test_probs >= thresh).astype(int)
	    path = f'/home/lzc/sub_lgb_t{thresh:.2f}.csv'
	    submission = pd.DataFrame(
	        {
	            'Index': list(range(len(preds))),
	            'Predicted': [str(label) for label in preds.tolist()],
	        },
	        dtype=object,
	    )
	    submission.to_csv(path, index=False)
	    print(f" t={thresh:.2f}: pos={preds.mean():.4f}")

	print("\nLightGBM model and predictions saved!")