| """Dynamic-split pair feature model for author-paper recommendation. |
| |
| This follows the notebook-style split on every run, then trains a stronger |
| pair-level LightGBM model using graph, content, coauthor, citation, and optional |
| GNN score features. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import importlib.util |
| import pickle as pkl |
| from pathlib import Path |
|
|
| import lightgbm as lgb |
| import numpy as np |
| import pandas as pd |
| from sklearn.metrics import precision_recall_curve, roc_auc_score |
|
|
|
|
def load_train_module(path: Path):
    """Import the Python file at *path* as a module and return it.

    The file is executed under the module name ``train_val_lgcn_ensemble``.
    The resulting module object is returned to the caller; this function does
    not add it to ``sys.modules``.

    Args:
        path: Location of the Python source file to execute.

    Returns:
        The freshly executed module object.

    Raises:
        ImportError: If no import spec or loader can be created for *path*
            (for example, the file suffix is not a recognised module suffix).
    """
    spec = importlib.util.spec_from_file_location("train_val_lgcn_ensemble", path)
    # spec_from_file_location returns None when no loader matches the path.
    # Check before module_from_spec (which would fail with an opaque
    # AttributeError) and raise instead of asserting so the check is not
    # stripped under ``python -O``.
    if spec is None or spec.loader is None:
        raise ImportError(f"cannot create an import spec for {path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
|
|
|
|
def read_txt(path: Path):
    """Read a whitespace-separated integer table from *path*.

    Each non-blank line becomes one row of ``int`` values. Blank lines are
    skipped so that a trailing newline or spacer line does not produce an
    empty row (which would make the result ragged for callers that convert
    it to a 2-D array).

    Args:
        path: Text file with one row of integers per line.

    Returns:
        A list of rows, each a list of ints.

    Raises:
        ValueError: If a token cannot be parsed as an integer.
    """
    # Use a context manager so the file handle is closed deterministically.
    with path.open() as handle:
        return [[int(tok) for tok in line.split()] for line in handle if line.strip()]
|
|
|
|
def best_f1(y, s):
    """Return ``(best F1, threshold, ROC AUC)`` for scores *s* against labels *y*.

    The F1 value is maximised over the points of the precision-recall curve;
    the returned threshold is the one at that point, or ``0.5`` when the best
    point has no matching entry in the thresholds array.
    """
    precision, recall, thresholds = precision_recall_curve(y, s)
    # The epsilon keeps the ratio finite where precision + recall is zero.
    f1_curve = (2 * precision * recall) / (precision + recall + 1e-12)
    best = int(np.argmax(f1_curve))
    best_score = float(f1_curve[best])
    if best < len(thresholds):
        threshold = float(thresholds[best])
    else:
        # argmax landed past the end of the thresholds array.
        threshold = float(0.5)
    auc = float(roc_auc_score(y, s))
    return best_score, threshold, auc
|
|
|
|
def rank01(x):
    """Map values to evenly spaced ranks in ``[0, 1]`` (float32).

    The smallest value gets 0 and the largest gets 1. Ties are broken by
    original position because the sort is stable.
    """
    size = len(x)
    ranks = np.empty(size, dtype=np.float32)
    positions = np.linspace(0, 1, size, dtype=np.float32)
    # Scatter the ascending grid back to each element's original slot.
    ranks[np.argsort(x, kind="mergesort")] = positions
    return ranks
|
|
|
|
| class FeatureBuilder: |
| def __init__(self, root: Path, train_refs: pd.DataFrame): |
| self.root = root |
| data_dir = root / "data_and_docs" |
| self.train = train_refs[["source", "target"]].to_numpy(np.int64) |
| self.citation = np.array(read_txt(data_dir / "paper_file_ann.txt"), dtype=np.int64) |
| self.coauthor = np.array(read_txt(data_dir / "author_file_ann.txt"), dtype=np.int64) |
| with (data_dir / "feature.pkl").open("rb") as f: |
| feat = pkl.load(f).numpy().astype(np.float32) |
| feat = feat / (np.linalg.norm(feat, axis=1, keepdims=True) + 1e-8) |
| self.paper_feat = feat |
| self.n_author = 6611 |
| self.n_paper = 79937 |
|
|
| self.author_deg = np.zeros(self.n_author, np.float32) |
| self.paper_deg = np.zeros(self.n_paper, np.float32) |
| for a, p in self.train: |
| self.author_deg[a] += 1 |
| self.paper_deg[p] += 1 |
| self.cite_out = np.zeros(self.n_paper, np.float32) |
| self.cite_in = np.zeros(self.n_paper, np.float32) |
| for s, t in self.citation: |
| self.cite_out[s] += 1 |
| self.cite_in[t] += 1 |
|
|
| self.author_papers = [[] for _ in range(self.n_author)] |
| for a, p in self.train: |
| self.author_papers[a].append(p) |
| self.author_profile = np.zeros((self.n_author, feat.shape[1]), np.float32) |
| self.author_max_pop = np.zeros(self.n_author, np.float32) |
| self.author_mean_pop = np.zeros(self.n_author, np.float32) |
| for a, papers in enumerate(self.author_papers): |
| if papers: |
| pf = feat[np.array(papers)] |
| self.author_profile[a] = pf.mean(axis=0) |
| n = np.linalg.norm(self.author_profile[a]) |
| if n > 0: |
| self.author_profile[a] /= n |
| pops = self.paper_deg[np.array(papers)] |
| self.author_max_pop[a] = pops.max() |
| self.author_mean_pop[a] = pops.mean() |
|
|
| self.train_set = set(map(tuple, self.train.tolist())) |
| self.coauthors = [set() for _ in range(self.n_author)] |
| for a, b in self.coauthor: |
| self.coauthors[a].add(b) |
| self.coauthors[b].add(a) |
| self.coauthor_read = [set() for _ in range(self.n_author)] |
| for a in range(self.n_author): |
| s = set() |
| for c in self.coauthors[a]: |
| s.update(self.author_papers[c]) |
| self.coauthor_read[a] = s |
|
|
| self.cites = [set() for _ in range(self.n_paper)] |
| self.cited_by = [set() for _ in range(self.n_paper)] |
| for s, t in self.citation: |
| self.cites[s].add(t) |
| self.cited_by[t].add(s) |
|
|
| def sample_train_pairs(self, n_pos: int, neg_per_pos: int, seed: int, forbidden_pairs: set[tuple[int, int]] | None = None): |
| rng = np.random.default_rng(seed) |
| pos_idx = rng.choice(len(self.train), size=min(n_pos, len(self.train)), replace=False) |
| pos = self.train[pos_idx] |
| neg = [] |
| authors = pos[:, 0] |
| forbidden = self.train_set if forbidden_pairs is None else forbidden_pairs |
| popular = np.flatnonzero(self.paper_deg >= np.percentile(self.paper_deg[self.paper_deg > 0], 70)) |
| while len(neg) < len(pos) * neg_per_pos: |
| a = int(authors[len(neg) % len(authors)]) |
| if rng.random() < 0.35 and self.coauthor_read[a]: |
| p = int(rng.choice(list(self.coauthor_read[a]))) |
| elif rng.random() < 0.70: |
| |
| p = int(rng.choice(popular)) |
| else: |
| p = int(rng.integers(0, self.n_paper)) |
| if (a, p) not in forbidden: |
| neg.append((a, p)) |
| X_pairs = np.vstack([pos, np.array(neg, dtype=np.int64)]) |
| y = np.concatenate([np.ones(len(pos), np.int8), np.zeros(len(neg), np.int8)]) |
| return X_pairs, y |
|
|
| def sample_task_pairs( |
| self, |
| positives: np.ndarray, |
| n_pos: int, |
| neg_per_pos: int, |
| seed: int, |
| forbidden_pairs: set[tuple[int, int]], |
| ): |
| rng = np.random.default_rng(seed) |
| pos_idx = rng.choice(len(positives), size=min(n_pos, len(positives)), replace=False) |
| pos = positives[pos_idx].astype(np.int64, copy=False) |
| neg = [] |
| authors = pos[:, 0] |
| positive_deg_papers = np.flatnonzero(self.paper_deg > 0) |
| if len(positive_deg_papers) == 0: |
| positive_deg_papers = np.arange(self.n_paper) |
| popular_cut = np.percentile(self.paper_deg[positive_deg_papers], 70) |
| popular = np.flatnonzero(self.paper_deg >= popular_cut) |
| while len(neg) < len(pos) * neg_per_pos: |
| a = int(authors[len(neg) % len(authors)]) |
| r = rng.random() |
| if r < 0.45 and self.coauthor_read[a]: |
| p = int(rng.choice(list(self.coauthor_read[a]))) |
| elif r < 0.85 and len(popular): |
| p = int(rng.choice(popular)) |
| else: |
| p = int(rng.integers(0, self.n_paper)) |
| if (a, p) not in forbidden_pairs: |
| neg.append((a, p)) |
| X_pairs = np.vstack([pos, np.array(neg, dtype=np.int64)]) |
| y = np.concatenate([np.ones(len(pos), np.int8), np.zeros(len(neg), np.int8)]) |
| return X_pairs, y |
|
|
| def transform(self, pairs: np.ndarray): |
| n = len(pairs) |
| out = np.zeros((n, 22), dtype=np.float32) |
| for i, (a, p) in enumerate(pairs): |
| papers = self.author_papers[a] |
| out[i, 0] = np.log1p(self.author_deg[a]) |
| out[i, 1] = np.log1p(self.paper_deg[p]) |
| out[i, 2] = np.log1p(self.cite_in[p]) |
| out[i, 3] = np.log1p(self.cite_out[p]) |
| out[i, 4] = np.log1p(len(self.coauthors[a])) |
| out[i, 5] = self.paper_deg[p] / (self.author_mean_pop[a] + 1.0) |
| out[i, 6] = self.paper_deg[p] / (self.author_max_pop[a] + 1.0) |
| out[i, 7] = float(p in self.coauthor_read[a]) |
| out[i, 8] = np.log1p(sum(1 for c in self.coauthors[a] if p in self.author_papers[c])) |
| out[i, 9] = float((a, p) in self.train_set) |
| out[i, 10] = float(self.author_profile[a].dot(self.paper_feat[p])) |
| if papers: |
| arr = np.array(papers, dtype=np.int64) |
| sims = self.paper_feat[arr] @ self.paper_feat[p] |
| out[i, 11] = float(sims.max()) |
| out[i, 12] = float(sims.mean()) |
| out[i, 13] = float(np.percentile(sims, 90)) |
| |
| cand_cites = self.cites[p] |
| cand_cited_by = self.cited_by[p] |
| hist = set(papers) |
| out[i, 14] = np.log1p(len(cand_cites & hist)) |
| out[i, 15] = np.log1p(len(cand_cited_by & hist)) |
| total_neighbors = set() |
| for hp in papers[:80]: |
| total_neighbors.update(self.cites[hp]) |
| total_neighbors.update(self.cited_by[hp]) |
| out[i, 16] = float(p in total_neighbors) |
| out[i, 17] = np.log1p(len(total_neighbors & cand_cites)) |
| out[i, 18] = np.log1p(len(total_neighbors & cand_cited_by)) |
| out[i, 19] = np.log1p(len(papers)) |
| out[i, 20] = self.cite_in[p] / (self.paper_deg[p] + 1.0) |
| out[i, 21] = self.cite_out[p] / (self.paper_deg[p] + 1.0) |
| return out |
|
|
|
|
def main():
    """Train and evaluate the pair-feature LightGBM model on a fresh split.

    Steps:
        1. Build the train/validation split through the project's
           ``train_val_lgcn_ensemble`` script.
        2. Hold out a random fraction of the training edges as pseudo
           positives; features for training pairs are computed from the
           remaining "support" edges only, so the held-out edges are not
           visible as known edges in their own features.
        3. Optionally append GNN validation scores (and their rank transform)
           as extra validation columns; the training matrix gets zero columns
           of the same width.
        4. Fit LightGBM with early stopping on the validation set and write
           predictions and summary metrics under
           ``validation_runs/dynamic_seed<seed>/feature_fusion``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--package-root", type=Path, default=Path(__file__).resolve().parents[1])
    parser.add_argument("--split-seed", type=int, required=True)
    parser.add_argument("--train-frac", type=float, default=0.9)
    parser.add_argument("--n-pos", type=int, default=250000)
    parser.add_argument("--neg-per-pos", type=int, default=3)
    parser.add_argument("--inner-holdout-frac", type=float, default=0.12)
    parser.add_argument("--gnn-run", action="append", default=[])
    args = parser.parse_args()
    # A fraction of 0 leaves no pseudo positives to train on and a fraction of
    # 1 leaves no support edges; both would only fail later and less clearly.
    if not 0.0 < args.inner_holdout_frac < 1.0:
        parser.error("--inner-holdout-frac must be strictly between 0 and 1")

    root = args.package_root
    tv = load_train_module(root / "code" / "train_val_lgcn_ensemble.py")
    train_refs, val_pairs = tv.make_notebook_style_split(root, args.split_seed, args.train_frac)
    # Separate generator (offset seed) for the inner support / pseudo-positive split.
    rng = np.random.default_rng(args.split_seed + 17001)
    mask = rng.random(len(train_refs)) >= args.inner_holdout_frac
    support_refs = train_refs.loc[mask].copy()
    pseudo_pos = train_refs.loc[~mask, ["source", "target"]].to_numpy(np.int64)
    all_train_pairs = set(map(tuple, train_refs[["source", "target"]].to_numpy(np.int64).tolist()))

    train_fb = FeatureBuilder(root, support_refs)
    eval_fb = FeatureBuilder(root, train_refs)
    train_pairs, y_train = train_fb.sample_task_pairs(
        pseudo_pos,
        args.n_pos,
        args.neg_per_pos,
        args.split_seed,
        all_train_pairs,
    )
    val_arr = val_pairs[["source", "target"]].to_numpy(np.int64)
    y_val = val_pairs["label"].to_numpy(np.int8)

    print("computing train features", train_pairs.shape)
    X_train = train_fb.transform(train_pairs)
    print("computing val features", val_arr.shape)
    X_val = eval_fb.transform(val_arr)

    # Optional GNN score columns: each usable score file contributes the raw
    # score and its rank transform to the validation matrix.
    gnn_columns = []
    for run in args.gnn_run:
        score_dir = root / "validation_runs" / f"dynamic_seed{args.split_seed}" / run / "scores"
        score_files = sorted(score_dir.glob("val_*.npy"))
        if not score_files:
            print(f"warning: no val_*.npy score files found in {score_dir}")
        for score_file in score_files:
            s = np.load(score_file).astype(np.float32)
            if len(s) != len(y_val):
                # Report instead of silently dropping a misaligned score file.
                print(f"warning: skipping {score_file}: {len(s)} scores for {len(y_val)} validation pairs")
                continue
            gnn_columns.extend([s, rank01(s)])
    if gnn_columns:
        X_val = np.column_stack([X_val, *gnn_columns])
        # No GNN scores exist for the sampled training pairs, so the training
        # matrix is padded with zero columns to keep the widths equal.
        # NOTE(review): these columns are constant during training, so the
        # model cannot learn splits on them - confirm this is intended.
        X_train = np.column_stack([X_train, np.zeros((len(X_train), len(gnn_columns)), np.float32)])

    clf = lgb.LGBMClassifier(
        n_estimators=1200,
        learning_rate=0.025,
        num_leaves=63,
        max_depth=-1,
        subsample=0.85,
        colsample_bytree=0.85,
        reg_lambda=3.0,
        min_child_samples=50,
        objective="binary",
        verbose=-1,
    )
    # NOTE(review): early stopping and the F1 threshold search both use the
    # validation set that is also reported on, so the metrics are optimistic.
    clf.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        eval_metric="binary_logloss",
        callbacks=[lgb.early_stopping(80, verbose=False)],
    )
    pred = clf.predict_proba(X_val)[:, 1].astype(np.float32)
    f1, th, auc = best_f1(y_val, pred)
    out = root / "validation_runs" / f"dynamic_seed{args.split_seed}" / "feature_fusion"
    out.mkdir(parents=True, exist_ok=True)
    np.save(out / "val_feature_lgb.npy", pred)
    pd.DataFrame([{"f1": f1, "threshold": th, "auc": auc, "best_iter": clf.best_iteration_}]).to_csv(out / "result.csv", index=False)
    print(f"Feature LGB: f1={f1:.6f} th={th:.6f} auc={auc:.6f} best_iter={clf.best_iteration_}")


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|