| """Graph structural features + LightGBM classifier for link prediction. |
| |
| A fundamentally different approach from GNN: |
| - Explicitly compute graph statistics for each author-paper pair |
| - Train a gradient boosting classifier on these features |
| - Ensemble with GNN predictions for final submission |
| """ |
| import os |
| import pickle as pkl |
| import random |
| from collections import defaultdict |
|
|
| import numpy as np |
| import pandas as pd |
| from sklearn.preprocessing import StandardScaler |
| from sklearn.metrics import f1_score, precision_recall_curve, roc_auc_score |
| import lightgbm as lgb |
|
|
|
|
def set_seed(seed=0):
    """Seed Python's ``random`` module and NumPy's global RNG with ``seed``."""
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Seed both generators once, before any random sampling happens.
set_seed(0)
|
|
| |
# Directory that holds the dataset files. Defaults to the original hard-coded
# location; set the CS3319_BASE_PATH environment variable to run elsewhere.
base_path = os.environ.get("CS3319_BASE_PATH", "/home/lzc/cs3319-project")
|
|
|
|
def read_txt(file):
    """Parse a whitespace-separated integer file into a list of int lists.

    Args:
        file: Path of the text file to read.

    Returns:
        One ``list[int]`` per non-blank line, in file order.

    Raises:
        ValueError: If a token cannot be parsed as an integer.
    """
    with open(file, "r") as f:
        # ``str.split()`` without arguments already drops surrounding
        # whitespace. Blank lines produce no tokens and are skipped so that
        # callers unpacking each row never receive an empty list.
        token_rows = (line.split() for line in f)
        return [list(map(int, tokens)) for tokens in token_rows if tokens]
|
|
|
|
# Load the four integer edge lists in a fixed order: author-paper training
# edges, author-paper test pairs, the pairs used below as co-author links and
# the pairs used below as paper citation links.
train_edges, test_edges, coauthor, citation = (
    read_txt(os.path.join(base_path, edge_file))
    for edge_file in (
        "bipartite_train_ann.txt",
        "bipartite_test_ann.txt",
        "author_file_ann.txt",
        "paper_file_ann.txt",
    )
)

# NOTE(review): unpickling executes arbitrary code, so feature.pkl must come
# from a trusted source. The loaded object has to expose ``.numpy()``
# (presumably a torch tensor -- confirm).
feature_path = os.path.join(base_path, "feature.pkl")
with open(feature_path, 'rb') as f:
    paper_feat = pkl.load(f).numpy().astype(np.float32)

# Node counts are hard-coded for this dataset.
n_authors, n_papers = 6611, 79937
print(f"Authors: {n_authors}, Papers: {n_papers}")
|
|
|
|
| |
def log1p_norm(x):
    """Return the z-scored ``log1p`` of ``x``, clipped to the range [-5, 5].

    The standard deviation is offset by 1e-8 so a constant input does not
    divide by zero.
    """
    logged = np.log1p(x)
    centered = logged - logged.mean()
    scale = logged.std() + 1e-8
    return np.clip(centered / scale, -5, 5)
|
|
|
|
print("Building graph structures...")

# Author <-> paper adjacency from the training edges, in both directions.
author_papers = defaultdict(set)
paper_authors = defaultdict(set)
for author_id, paper_id in train_edges:
    author_papers[author_id].add(paper_id)
    paper_authors[paper_id].add(author_id)

# Co-author links are stored symmetrically.
coauthor_set = defaultdict(set)
for left, right in coauthor:
    coauthor_set[left].add(right)
    coauthor_set[right].add(left)

# Citation links: the first column is stored as the citing paper, the second
# as the cited paper.
paper_cites_set = defaultdict(set)
paper_cited_by_set = defaultdict(set)
for citing, cited in citation:
    paper_cites_set[citing].add(cited)
    paper_cited_by_set[cited].add(citing)


def _set_sizes(adjacency, count):
    """Return ``len(adjacency[i])`` for ``i`` in ``range(count)`` as float32."""
    return np.array([len(adjacency[i]) for i in range(count)], dtype=np.float32)


# Per-node neighbour counts, indexed by node id.
author_deg = _set_sizes(author_papers, n_authors)
paper_deg = _set_sizes(paper_authors, n_papers)
author_coauthor_deg = _set_sizes(coauthor_set, n_authors)
paper_cite_out = _set_sizes(paper_cites_set, n_papers)
paper_cite_in = _set_sizes(paper_cited_by_set, n_papers)

# For each author, the union of the papers linked to any of their co-authors.
# Authors without co-authors get no entry.
coauthor_papers_set = defaultdict(set)
for author_id in range(n_authors):
    for peer in coauthor_set[author_id]:
        coauthor_papers_set[author_id] |= author_papers[peer]

# Row-normalized paper features and, per author, the mean normalized feature
# vector over that author's training papers (zeros when the author has none).
from sklearn.preprocessing import normalize
paper_feat_norm = normalize(paper_feat.astype(np.float64))
author_avg_emb = np.zeros((n_authors, paper_feat.shape[1]), dtype=np.float32)
for author_id in range(n_authors):
    linked_papers = author_papers[author_id]
    if not linked_papers:
        continue
    mean_vec = paper_feat_norm[list(linked_papers)].mean(axis=0)
    author_avg_emb[author_id] = mean_vec.astype(np.float32)

# Popularity percentile: position of each node in the ascending degree order,
# divided by the node count. Ties are ordered however np.argsort orders them.
paper_pop_pct = np.zeros(n_papers, dtype=np.float32)
deg_order = np.argsort(paper_deg)
paper_pop_pct[deg_order] = np.arange(n_papers) / n_papers

author_pop_pct = np.zeros(n_authors, dtype=np.float32)
deg_order = np.argsort(author_deg)
author_pop_pct[deg_order] = np.arange(n_authors) / n_authors
|
|
| |
def compute_features(pairs, batch_size=100000):
    """Compute graph structural features for author-paper pairs.

    Reads the module-level lookup tables ``author_deg``, ``paper_deg``,
    ``author_coauthor_deg``, ``paper_cite_in``, ``paper_cite_out``,
    ``author_pop_pct``, ``paper_pop_pct``, ``coauthor_papers_set``,
    ``author_avg_emb`` and ``paper_feat_norm``.

    Args:
        pairs: Integer array-like of shape (n, 2); column 0 holds author ids
            and column 1 holds paper ids. May be empty.
        batch_size: Number of pairs processed per chunk.

    Returns:
        float32 array of shape (n, 20) with columns:
        0 author degree, 1 paper degree, 2 author co-author count,
        3 paper citations in, 4 paper citations out,
        5 author degree * paper degree,
        6 log1p(author degree), 7 log1p(paper degree),
        8 author popularity percentile, 9 paper popularity percentile,
        10 1.0 if the paper is linked to one of the author's co-authors,
        11 author co-author count (same values as column 2),
        12 citations in / (author degree + 1),
        13 citations out / (author degree + 1),
        14 dot product of the author's mean embedding and the paper embedding,
        15 paper degree / (author degree + paper degree + 1),
        16 author degree <= 5, 17 paper degree <= 3,
        18 mean of the two percentiles, 19 their absolute difference.
    """
    n_features = 20
    pairs = np.asarray(pairs)
    n = len(pairs)
    if n == 0:
        # np.vstack raises on an empty list, so return an empty feature
        # matrix with the right width instead.
        return np.zeros((0, n_features), dtype=np.float32)

    no_papers = frozenset()
    all_feats = []

    for start in range(0, n, batch_size):
        batch = pairs[start:start + batch_size]
        authors = batch[:, 0]
        papers = batch[:, 1]

        # Look each table up once per batch and reuse the result below.
        a_deg = author_deg[authors]
        p_deg = paper_deg[papers]
        a_coauthors = author_coauthor_deg[authors]
        cite_in = paper_cite_in[papers]
        cite_out = paper_cite_out[papers]
        a_pct = author_pop_pct[authors]
        p_pct = paper_pop_pct[papers]

        feats = np.zeros((len(batch), n_features), dtype=np.float32)

        # Raw degrees and citation counts.
        feats[:, 0] = a_deg
        feats[:, 1] = p_deg
        feats[:, 2] = a_coauthors
        feats[:, 3] = cite_in
        feats[:, 4] = cite_out

        # Product of the two bipartite degrees.
        feats[:, 5] = a_deg * p_deg

        # Log-scaled degrees.
        feats[:, 6] = np.log1p(a_deg)
        feats[:, 7] = np.log1p(p_deg)

        # Popularity percentiles.
        feats[:, 8] = a_pct
        feats[:, 9] = p_pct

        # Whether the paper is linked to any co-author of the author.
        # ``.get`` avoids inserting empty entries into the defaultdict.
        feats[:, 10] = [
            paper in coauthor_papers_set.get(author, no_papers)
            for author, paper in zip(authors.tolist(), papers.tolist())
        ]

        # Number of co-authors. author_coauthor_deg already stores
        # len(coauthor_set[a]), so this equals column 2; the column is kept
        # so the 20-column layout stays unchanged.
        feats[:, 11] = a_coauthors

        # Citation counts relative to the author's degree.
        feats[:, 12] = cite_in / (a_deg + 1)
        feats[:, 13] = cite_out / (a_deg + 1)

        # Dot product between the author's mean embedding and the paper's
        # normalized embedding.
        a_emb = author_avg_emb[authors]
        p_emb = paper_feat_norm[papers]
        feats[:, 14] = np.sum(a_emb * p_emb, axis=1)

        # Paper degree as a share of the combined degree.
        feats[:, 15] = p_deg / (a_deg + p_deg + 1)

        # Low-degree indicators.
        feats[:, 16] = (a_deg <= 5).astype(np.float32)
        feats[:, 17] = (p_deg <= 3).astype(np.float32)

        # Percentile interactions.
        feats[:, 18] = (a_pct + p_pct) / 2
        feats[:, 19] = np.abs(a_pct - p_pct)

        all_feats.append(feats)

    return np.vstack(all_feats)
|
|
|
|
| |
print("Preparing training data...")
existing_set = set(map(tuple, train_edges))

# NOTE(review): the lookup tables read by compute_features (e.g. author_deg,
# paper_deg) are built from train_edges elsewhere in this file, so every
# positive sampled here already contributes to its own features. Pairs that
# are absent from the graph at prediction time will not have that property;
# consider holding the sampled positives out of the graph before computing
# their features.


def _sample_negative_pairs(n_needed, known_edges):
    """Draw ``n_needed`` random (author, paper) pairs absent from ``known_edges``.

    Candidates are drawn independently from ``range(n_authors)`` and
    ``range(n_papers)`` in batches of ``2 * n_needed`` and kept in draw order,
    so the result may contain repeated pairs.

    Returns:
        A list of ``(author, paper)`` tuples of plain Python ints.
    """
    sampled = []
    while len(sampled) < n_needed:
        cand_authors = np.random.randint(0, n_authors, size=n_needed * 2)
        cand_papers = np.random.randint(0, n_papers, size=n_needed * 2)
        # .tolist() yields plain Python ints for the set lookup.
        for pair in zip(cand_authors.tolist(), cand_papers.tolist()):
            if pair not in known_edges:
                sampled.append(pair)
                if len(sampled) >= n_needed:
                    break
    return sampled


# Positives: a random subset of the observed edges, capped at 200k.
n_pos_train = min(200000, len(train_edges))
pos_indices = np.random.choice(len(train_edges), n_pos_train, replace=False)
train_pos = np.array(train_edges)[pos_indices]

# Negatives: three random non-edges per positive.
n_neg_train = n_pos_train * 3
neg_pairs = _sample_negative_pairs(n_neg_train, existing_set)
train_neg = np.array(neg_pairs, dtype=np.int64)

print(f"Training samples: {len(train_pos)} pos + {len(train_neg)} neg = {len(train_pos) + len(train_neg)}")

print("Computing training features...")
X_pos = compute_features(train_pos)
X_neg = compute_features(train_neg)

X_train = np.vstack([X_pos, X_neg])
y_train = np.concatenate([np.ones(len(X_pos)), np.zeros(len(X_neg))])

# Shuffle so positives and negatives are interleaved.
idx = np.random.permutation(len(X_train))
X_train, y_train = X_train[idx], y_train[idx]

print("Creating validation set...")
# Validation positives come from the observed edges not used for training.
# The cap is taken from the size of that remainder (not from
# len(train_edges) - n_pos_train), so duplicate rows in train_edges cannot
# make np.random.choice request more samples than exist.
remaining = list(existing_set - set(map(tuple, train_pos.tolist())))
n_val_pos = min(50000, len(remaining))
if n_val_pos == 0:
    raise ValueError(
        "No training edges are left for validation after sampling "
        f"{n_pos_train} training positives."
    )
val_pos_indices = np.random.choice(len(remaining), n_val_pos, replace=False)
val_pos = np.array([remaining[i] for i in val_pos_indices])

# One random non-edge per validation positive.
neg_val_pairs = _sample_negative_pairs(n_val_pos, existing_set)
val_neg = np.array(neg_val_pairs, dtype=np.int64)

X_val_pos = compute_features(val_pos)
X_val_neg = compute_features(val_neg)
X_val = np.vstack([X_val_pos, X_val_neg])
y_val = np.concatenate([np.ones(len(val_pos)), np.zeros(len(val_neg))])
|
|
| |
print("Training LightGBM...")
# Labels for the 20 feature columns, in column order.
feature_names = [
    'author_deg',
    'paper_deg',
    'author_coauthor_deg',
    'paper_cite_in',
    'paper_cite_out',
    'pref_attach',
    'log_author_deg',
    'log_paper_deg',
    'author_pop_pct',
    'paper_pop_pct',
    'coauthor_reads',
    'n_coauthors',
    'cite_in_ratio',
    'cite_out_ratio',
    'cos_sim_author_paper',
    'paper_deg_ratio',
    'cold_start_author',
    'cold_start_paper',
    'avg_pop_pct',
    'pop_pct_diff',
]

lgb_params = dict(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=8,
    num_leaves=63,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_samples=50,
    reg_alpha=0.1,
    reg_lambda=0.1,
    verbose=-1,
    random_state=0,
    n_jobs=-1,
)
model = lgb.LGBMClassifier(**lgb_params)
model.fit(X_train, y_train)

# Score the validation set and pick the threshold that maximizes F1.
val_probs = model.predict_proba(X_val)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_val, val_probs)
f1s = 2 * precision * recall / (precision + recall + 1e-12)
best_idx = np.argmax(f1s)
if best_idx < len(thresholds):
    best_thresh = thresholds[best_idx]
else:
    # The best index has no matching threshold entry; fall back to 0.5.
    best_thresh = 0.5
val_auc = roc_auc_score(y_val, val_probs)
print(f"LightGBM val F1: {f1s[best_idx]:.4f}, AUC: {val_auc:.4f}, Thresh: {best_thresh:.4f}")

# Report the ten features with the largest importance values.
importances = model.feature_importances_
ranked = sorted(
    zip(feature_names, importances),
    key=lambda pair: pair[1],
    reverse=True,
)
for name, imp in ranked[:10]:
    print(f" {name}: {imp:.4f}")
|
|
| |
print("\nPredicting test set...")
test_arr = np.array(test_edges, dtype=np.int64)
X_test = compute_features(test_arr, batch_size=50000)
test_probs = model.predict_proba(X_test)[:, 1]

# Persist the fitted model.
import joblib
joblib.dump(model, '/home/lzc/lgb_model.pkl')

# Flag the test pairs that also appear among the training edges.
train_set_full = set(map(tuple, train_edges))
test_pair_tuples = [tuple(pair) for pair in test_edges]
overlap = train_set_full.intersection(test_pair_tuples)
known_mask = np.array([(pair in overlap) for pair in test_pair_tuples])

# Save the raw scores and the mask.
np.save('/home/lzc/test_lgb_scores.npy', test_probs)
np.save('/home/lzc/test_known_mask.npy', known_mask)

# Write one submission CSV per decision threshold from 0.30 to 0.70.
for thresh in np.arange(0.30, 0.71, 0.05):
    preds = (test_probs >= thresh).astype(int)
    path = f'/home/lzc/sub_lgb_t{thresh:.2f}.csv'
    data_out = [
        [row_id, str(label)]
        for row_id, label in enumerate(preds.tolist())
    ]
    submission = pd.DataFrame(data_out, columns=['Index', 'Predicted'], dtype=object)
    submission.to_csv(path, index=False)
    print(f" t={thresh:.2f}: pos={preds.mean():.4f}")

print("\nLightGBM model and predictions saved!")
|
|