"""Graph structural features + LightGBM classifier for link prediction. A fundamentally different approach from GNN: - Explicitly compute graph statistics for each author-paper pair - Train a gradient boosting classifier on these features - Ensemble with GNN predictions for final submission """ import os import pickle as pkl import random from collections import defaultdict import numpy as np import pandas as pd from sklearn.preprocessing import StandardScaler from sklearn.metrics import f1_score, precision_recall_curve, roc_auc_score import lightgbm as lgb def set_seed(seed=0): random.seed(seed) np.random.seed(seed) set_seed(0) # ── Load data ───────────────────────────────────────────────────── base_path = "/home/lzc/cs3319-project" def read_txt(file): res_list = [] with open(file, "r") as f: for line in f: res_list.append(list(map(int, line.strip().split()))) return res_list train_edges = read_txt(os.path.join(base_path, "bipartite_train_ann.txt")) test_edges = read_txt(os.path.join(base_path, "bipartite_test_ann.txt")) coauthor = read_txt(os.path.join(base_path, "author_file_ann.txt")) citation = read_txt(os.path.join(base_path, "paper_file_ann.txt")) with open(os.path.join(base_path, "feature.pkl"), 'rb') as f: paper_feat = pkl.load(f).numpy().astype(np.float32) n_authors = 6611 n_papers = 79937 print(f"Authors: {n_authors}, Papers: {n_papers}") # ── Build lookup structures ─────────────────────────────────────── def log1p_norm(x): x = np.log1p(x) return np.clip((x - x.mean()) / (x.std() + 1e-8), -5, 5) print("Building graph structures...") author_papers = defaultdict(set) for a, p in train_edges: author_papers[a].add(p) paper_authors = defaultdict(set) for a, p in train_edges: paper_authors[p].add(a) coauthor_set = defaultdict(set) for a1, a2 in coauthor: coauthor_set[a1].add(a2) coauthor_set[a2].add(a1) paper_cites_set = defaultdict(set) paper_cited_by_set = defaultdict(set) for p1, p2 in citation: paper_cites_set[p1].add(p2) paper_cited_by_set[p2].add(p1) # Degrees author_deg = np.array([len(author_papers[i]) for i in range(n_authors)], dtype=np.float32) paper_deg = np.array([len(paper_authors[i]) for i in range(n_papers)], dtype=np.float32) author_coauthor_deg = np.array([len(coauthor_set[i]) for i in range(n_authors)], dtype=np.float32) paper_cite_out = np.array([len(paper_cites_set[i]) for i in range(n_papers)], dtype=np.float32) paper_cite_in = np.array([len(paper_cited_by_set[i]) for i in range(n_papers)], dtype=np.float32) # Co-author papers coauthor_papers_set = defaultdict(set) for a in range(n_authors): for ca in coauthor_set[a]: coauthor_papers_set[a].update(author_papers[ca]) # Author avg paper embedding from sklearn.preprocessing import normalize paper_feat_norm = normalize(paper_feat.astype(np.float64)) author_avg_emb = np.zeros((n_authors, paper_feat.shape[1]), dtype=np.float32) for a in range(n_authors): if author_papers[a]: author_avg_emb[a] = paper_feat_norm[list(author_papers[a])].mean(axis=0).astype(np.float32) # Author embedding via LightGCN (load pre-computed if available, else use avg) # We'll use the avg embedding as a proxy for now # In final ensemble, we'll add GNN cosine scores as features too # Paper popularity percentile paper_pop_pct = np.zeros(n_papers, dtype=np.float32) deg_order = np.argsort(paper_deg) for i, idx in enumerate(deg_order): paper_pop_pct[idx] = i / n_papers author_pop_pct = np.zeros(n_authors, dtype=np.float32) deg_order = np.argsort(author_deg) for i, idx in enumerate(deg_order): author_pop_pct[idx] = i / n_authors # ── Feature computation ─────────────────────────────────────────── def compute_features(pairs, batch_size=100000): """Compute graph structural features for author-paper pairs.""" n = len(pairs) all_feats = [] for start in range(0, n, batch_size): end = min(start + batch_size, n) batch = pairs[start:end] authors = batch[:, 0] papers = batch[:, 1] m = len(authors) feats = np.zeros((m, 20), dtype=np.float32) # 0-4: Degree features feats[:, 0] = author_deg[authors] feats[:, 1] = paper_deg[papers] feats[:, 2] = author_coauthor_deg[authors] feats[:, 3] = paper_cite_in[papers] feats[:, 4] = paper_cite_out[papers] # 5: Preferential attachment feats[:, 5] = author_deg[authors] * paper_deg[papers] # 6-7: Log-transformed degrees feats[:, 6] = np.log1p(author_deg[authors]) feats[:, 7] = np.log1p(paper_deg[papers]) # 8-9: Popularity percentiles feats[:, 8] = author_pop_pct[authors] feats[:, 9] = paper_pop_pct[papers] # 10: Paper read by any co-author (binary) coauthor_reads = np.zeros(m, dtype=np.float32) for i in range(m): coauthor_reads[i] = float(papers[i] in coauthor_papers_set.get(authors[i], set())) feats[:, 10] = coauthor_reads # 11: Number of co-authors feats[:, 11] = np.array([len(coauthor_set.get(a, set())) for a in authors], dtype=np.float32) # 12-13: Paper citation degree / author degree ratio feats[:, 12] = paper_cite_in[papers] / (author_deg[authors] + 1) feats[:, 13] = paper_cite_out[papers] / (author_deg[authors] + 1) # 14: Cosine similarity between author avg embedding and paper embedding a_emb = author_avg_emb[authors] p_emb = paper_feat_norm[papers] feats[:, 14] = np.sum(a_emb * p_emb, axis=1) # 15: Paper degree / (author degree + paper degree) feats[:, 15] = paper_deg[papers] / (author_deg[authors] + paper_deg[papers] + 1) # 16-17: One-hot encoded degree buckets feats[:, 16] = (author_deg[authors] <= 5).astype(np.float32) # Cold-start author feats[:, 17] = (paper_deg[papers] <= 3).astype(np.float32) # Cold-start paper # 18-19: Combined degree percentiles feats[:, 18] = (author_pop_pct[authors] + paper_pop_pct[papers]) / 2 feats[:, 19] = np.abs(author_pop_pct[authors] - paper_pop_pct[papers]) all_feats.append(feats) return np.vstack(all_feats) # ── Prepare training data ───────────────────────────────────────── print("Preparing training data...") existing_set = set(map(tuple, train_edges)) # Sample positives for training the feature model n_pos_train = min(200000, len(train_edges)) pos_indices = np.random.choice(len(train_edges), n_pos_train, replace=False) train_pos = np.array(train_edges)[pos_indices] # Sample negatives (3x positives) n_neg_train = n_pos_train * 3 neg_pairs = [] while len(neg_pairs) < n_neg_train: a = np.random.randint(0, n_authors, size=n_neg_train * 2) p = np.random.randint(0, n_papers, size=n_neg_train * 2) for i in range(len(a)): if (a[i], p[i]) not in existing_set: neg_pairs.append((a[i], p[i])) if len(neg_pairs) >= n_neg_train: break train_neg = np.array(neg_pairs) print(f"Training samples: {len(train_pos)} pos + {len(train_neg)} neg = {len(train_pos) + len(train_neg)}") # Compute features print("Computing training features...") X_pos = compute_features(train_pos) X_neg = compute_features(train_neg) X_train = np.vstack([X_pos, X_neg]) y_train = np.concatenate([np.ones(len(X_pos)), np.zeros(len(X_neg))]) # Shuffle idx = np.random.permutation(len(X_train)) X_train, y_train = X_train[idx], y_train[idx] # ── Validation set ──────────────────────────────────────────────── print("Creating validation set...") n_val_pos = min(50000, len(train_edges) - n_pos_train) remaining = list(set(map(tuple, train_edges)) - set(map(tuple, train_pos.tolist()))) val_pos_indices = np.random.choice(len(remaining), n_val_pos, replace=False) val_pos = np.array([remaining[i] for i in val_pos_indices]) neg_val_pairs = [] while len(neg_val_pairs) < n_val_pos: a = np.random.randint(0, n_authors, size=n_val_pos * 2) p = np.random.randint(0, n_papers, size=n_val_pos * 2) for i in range(len(a)): if (a[i], p[i]) not in existing_set: neg_val_pairs.append((a[i], p[i])) if len(neg_val_pairs) >= n_val_pos: break val_neg = np.array(neg_val_pairs) X_val_pos = compute_features(val_pos) X_val_neg = compute_features(val_neg) X_val = np.vstack([X_val_pos, X_val_neg]) y_val = np.concatenate([np.ones(len(val_pos)), np.zeros(len(val_neg))]) # ── Train LightGBM ──────────────────────────────────────────────── print("Training LightGBM...") feature_names = [ 'author_deg', 'paper_deg', 'author_coauthor_deg', 'paper_cite_in', 'paper_cite_out', 'pref_attach', 'log_author_deg', 'log_paper_deg', 'author_pop_pct', 'paper_pop_pct', 'coauthor_reads', 'n_coauthors', 'cite_in_ratio', 'cite_out_ratio', 'cos_sim_author_paper', 'paper_deg_ratio', 'cold_start_author', 'cold_start_paper', 'avg_pop_pct', 'pop_pct_diff', ] model = lgb.LGBMClassifier( n_estimators=500, learning_rate=0.05, max_depth=8, num_leaves=63, subsample=0.8, colsample_bytree=0.8, min_child_samples=50, reg_alpha=0.1, reg_lambda=0.1, verbose=-1, random_state=0, n_jobs=-1, ) model.fit(X_train, y_train) # Validation evaluation val_probs = model.predict_proba(X_val)[:, 1] precision, recall, thresholds = precision_recall_curve(y_val, val_probs) f1s = 2 * precision * recall / (precision + recall + 1e-12) best_idx = np.argmax(f1s) best_thresh = thresholds[best_idx] if best_idx < len(thresholds) else 0.5 val_auc = roc_auc_score(y_val, val_probs) print(f"LightGBM val F1: {f1s[best_idx]:.4f}, AUC: {val_auc:.4f}, Thresh: {best_thresh:.4f}") # Feature importance importances = model.feature_importances_ for name, imp in sorted(zip(feature_names, importances), key=lambda x: -x[1])[:10]: print(f" {name}: {imp:.4f}") # ── Predict test set ────────────────────────────────────────────── print("\nPredicting test set...") test_arr = np.array(test_edges, dtype=np.int64) X_test = compute_features(test_arr, batch_size=50000) test_probs = model.predict_proba(X_test)[:, 1] # Save model and features import joblib joblib.dump(model, '/home/lzc/lgb_model.pkl') # ── Generate submissions ────────────────────────────────────────── train_set_full = set(map(tuple, train_edges)) overlap = train_set_full & set(map(tuple, test_edges)) known_mask = np.array([tuple(p) in overlap for p in test_edges]) # Save raw scores np.save('/home/lzc/test_lgb_scores.npy', test_probs) np.save('/home/lzc/test_known_mask.npy', known_mask) # Try different thresholds for thresh in np.arange(0.30, 0.71, 0.05): preds = (test_probs >= thresh).astype(int) path = f'/home/lzc/sub_lgb_t{thresh:.2f}.csv' data_out = [[idx, str(int(p))] for idx, p in enumerate(preds)] pd.DataFrame(data_out, columns=['Index', 'Predicted'], dtype=object).to_csv(path, index=False) print(f" t={thresh:.2f}: pos={preds.mean():.4f}") print("\nLightGBM model and predictions saved!")