"""Generate test submissions for the post-0.95 stacked ensemble.""" from __future__ import annotations import argparse import importlib.util import pickle as pkl import re from pathlib import Path import lightgbm as lgb import numpy as np import pandas as pd import torch def load_module(name: str, path: Path): spec = importlib.util.spec_from_file_location(name, path) module = importlib.util.module_from_spec(spec) assert spec.loader is not None spec.loader.exec_module(module) return module def read_txt(path: Path) -> list[list[int]]: return [list(map(int, line.strip().split())) for line in path.open()] def infer_layers(path: Path, state: dict) -> int: if "layer_weight" in state: return int(state["layer_weight"].shape[0] - 1) text = f"{path.parent.parent.name}_{path.name}" match = re.search(r"_l(\d+)d", text) if match: return int(match.group(1)) match = re.search(r"L(\d+)", text) if match: return int(match.group(1)) return 4 def infer_mode(score_path: Path) -> str: name = score_path.name if "_dot_" in name: return "dot" if "_neg_l2_" in name: return "neg_l2" return "cos" def score_cache_path(root: Path, split_seed: int, val_score_path: Path) -> Path: val_score_path = val_score_path.resolve() rel = val_score_path.relative_to(root / "validation_runs" / f"dynamic_seed{split_seed}") name = rel.name.replace("val_", "test_", 1) return root / "validation_runs" / f"dynamic_seed{split_seed}" / "post95_test_scores" / rel.parent / name def checkpoint_for_score(score_path: Path) -> Path: score_path = score_path.resolve() stem = score_path.stem.replace("val_", "", 1) if stem.endswith("_ensemble_mean"): raise ValueError("ensemble scores do not map to a single checkpoint") parts = stem.split("_") variant = parts[0] seed = next(p for p in parts if p.startswith("s") and p[1:].isdigit()) dim = next(p for p in parts if p.startswith("d") and p[1:].isdigit()) return score_path.parent.parent / "checkpoints" / f"{variant}_val_{seed}_{dim}.pt" def ensemble_member_scores(score_path: Path) -> list[Path]: score_path = score_path.resolve() result_path = score_path.parent.parent / "ensemble_result.txt" text = result_path.read_text().splitlines() models_line = next(line for line in text if line.startswith("models=")) stems = [x.strip() for x in models_line.split("=", 1)[1].split(",") if x.strip()] return [score_path.parent / f"{stem}.npy" for stem in stems] @torch.no_grad() def score_checkpoint_on_test( root: Path, split_seed: int, module, parts, data_cache: dict, test_pairs: np.ndarray, val_score_path: Path, device: str, batch_size: int, ) -> np.ndarray: out_path = score_cache_path(root, split_seed, val_score_path) if out_path.exists(): return np.load(out_path) out_path.parent.mkdir(parents=True, exist_ok=True) ckpt_path = checkpoint_for_score(val_score_path) state = torch.load(ckpt_path, map_location=device) embed_dim = state["author_emb.weight"].shape[1] layers = infer_layers(ckpt_path, state) variant = "learnw" if "learnw" in ckpt_path.name else "vanilla" run_name = ckpt_path.parent.parent.name use_citation = "no_cite" not in run_name and "author_paper_only" not in run_name use_coauthor = "no_coauthor" not in run_name and "author_paper_only" not in run_name data_key = (use_citation, use_coauthor) if data_key not in data_cache: data_cache[data_key] = module.build_data( parts, 6611, 79937, torch.device(device), use_citation=use_citation, use_coauthor=use_coauthor, ) model_cls = module.LearnableWeightLightGCN if variant == "learnw" else module.LightGCN model = model_cls(6611, parts["paper_feat_aug"].shape[1], embed_dim, layers).to(torch.device(device)) model.load_state_dict(state) scores = module.predict_scores( model, data_cache[data_key], test_pairs, batch_size, mode=infer_mode(val_score_path), normalize_embeddings=False, ).astype(np.float32) np.save(out_path, scores) del model if torch.cuda.is_available(): torch.cuda.empty_cache() print(f"saved {out_path}") return scores def score_val_path_on_test( root: Path, split_seed: int, module, parts, data_cache: dict, test_pairs: np.ndarray, val_score_path: Path, device: str, batch_size: int, ) -> np.ndarray: out_path = score_cache_path(root, split_seed, val_score_path) if out_path.exists(): return np.load(out_path) if val_score_path.name.endswith("_ensemble_mean.npy"): members = [ score_val_path_on_test(root, split_seed, module, parts, data_cache, test_pairs, p, device, batch_size) for p in ensemble_member_scores(val_score_path) ] out_path.parent.mkdir(parents=True, exist_ok=True) scores = np.mean(members, axis=0).astype(np.float32) np.save(out_path, scores) print(f"saved {out_path}") return scores return score_checkpoint_on_test(root, split_seed, module, parts, data_cache, test_pairs, val_score_path, device, batch_size) def select_variant_val_scores(post95, root: Path, split_seed: int, y: np.ndarray, max_cols: int) -> list[Path]: files = sorted((root / "validation_runs" / f"dynamic_seed{split_seed}").glob("dyn*/scores/val_*.npy")) rows = [] for path in files: if "hgt" in str(path) or "sage" in str(path) or "bce" in str(path) or "norm" in str(path) or "hinge" in str(path): continue scores = np.load(path).astype(np.float32) if len(scores) != len(y) or np.std(scores) < 1e-8: continue f1, _, auc, _, _ = post95.best_f1(y, scores) rows.append((f1, auc, path)) rows.sort(key=lambda r: r[0], reverse=True) return [p for _, _, p in rows[:max_cols]] def variant_feature_matrix(post95, raw_scores: list[np.ndarray]) -> np.ndarray: if not raw_scores: return np.zeros((0, 0), dtype=np.float32) cols = [] for scores in raw_scores: cols.extend([post95.zscore(scores), post95.rank01(scores)]) raw = np.vstack(raw_scores) cols.extend([post95.zscore(raw.mean(axis=0)), post95.zscore(raw.std(axis=0)), post95.rank01(raw.mean(axis=0))]) return np.column_stack(cols).astype(np.float32) def topk_content_similarity_fast(root: Path, pairs: np.ndarray, builder) -> np.ndarray: cache = root / "validation_runs" / "feature_cache" cache.mkdir(parents=True, exist_ok=True) key = f"topk_content_{len(pairs)}_{int(pairs[:,0].sum())}_{int(pairs[:,1].sum())}.npy" path = cache / key if path.exists(): return np.load(path) with (root / "data_and_docs" / "feature.pkl").open("rb") as f: feat = pkl.load(f).numpy().astype(np.float32) feat /= np.linalg.norm(feat, axis=1, keepdims=True) + 1e-8 out = np.zeros((len(pairs), 3), dtype=np.float32) order = np.argsort(pairs[:, 0], kind="mergesort") authors = pairs[order, 0] boundaries = np.r_[0, np.flatnonzero(authors[1:] != authors[:-1]) + 1, len(order)] for lo, hi in zip(boundaries[:-1], boundaries[1:]): idx = order[lo:hi] author = int(pairs[idx[0], 0]) hist = np.asarray(list(builder.author_papers[author]), dtype=np.int64) if len(hist) == 0: continue cand = pairs[idx, 1].astype(np.int64) sims = feat[cand] @ feat[hist].T out[idx, 0] = sims.max(axis=1) for col, k in [(1, 3), (2, 5)]: kk = min(k, sims.shape[1]) top = np.partition(sims, -kk, axis=1)[:, -kk:] out[idx, col] = top.mean(axis=1) np.save(path, out) return out def make_submissions(root: Path, out_dir: Path, pred_score: np.ndarray, ratios: list[float]) -> None: known = np.load(root / "cached_scores" / "test_known_mask.npy").astype(bool) for ratio in ratios: n_pos = int(round(len(pred_score) * ratio)) pred = np.zeros(len(pred_score), dtype=np.int8) pred[np.argsort(pred_score)[-n_pos:]] = 1 pred[known] = 1 sub = pd.DataFrame({"Index": np.arange(len(pred), dtype=np.int64), "Predicted": pred}) path = out_dir / f"submission_post95_ens_r{ratio:.3f}.csv" sub.to_csv(path, index=False) print(f"{path} positives={int(pred.sum())} ratio={pred.mean():.6f}") def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("--package-root", type=Path, default=Path(__file__).resolve().parents[1]) parser.add_argument("--split-seed", type=int, default=202) parser.add_argument("--main-val-score-file", type=Path, required=True) parser.add_argument("--device", default="cuda:0" if torch.cuda.is_available() else "cpu") parser.add_argument("--batch-size", type=int, default=131072) parser.add_argument("--max-variant-cols", type=int, default=20) parser.add_argument("--seed", type=int, default=202) parser.add_argument("--ratios", nargs="*", type=float, default=[0.498, 0.500, 0.502, 0.504, 0.505]) args = parser.parse_args() root = args.package_root args.main_val_score_file = args.main_val_score_file.resolve() stack_mod = load_module("stack_rank_calibration", root / "code" / "stack_rank_calibration.py") lgcn_mod = load_module("train_val_lgcn_ensemble", root / "code" / "train_val_lgcn_ensemble.py") post95 = load_module("post95_ablation", root / "code" / "post95_ablation.py") out_dir = root / "validation_runs" / f"dynamic_seed{args.split_seed}" / "post95_submission" out_dir.mkdir(parents=True, exist_ok=True) train_refs, val_pairs = lgcn_mod.make_notebook_style_split(root, args.split_seed, 0.9) val_pairs_arr = val_pairs[["source", "target"]].to_numpy(np.int64) y = val_pairs["label"].to_numpy(np.int8) main_val_score = np.load(args.main_val_score_file).astype(np.float32) print("building validation features") val_builder = stack_mod.ExplicitGraphFeatures(root, train_refs) X_val_hand = val_builder.transform(val_pairs_arr) X_val = np.column_stack( [ stack_mod.add_rank_features(val_pairs_arr, main_val_score), X_val_hand, post95.negative_evidence_features(X_val_hand, main_val_score), topk_content_similarity_fast(root, val_pairs_arr, val_builder), ] ).astype(np.float32) selected_paths = select_variant_val_scores(post95, root, args.split_seed, y, args.max_variant_cols) (out_dir / "selected_variant_val_scores.txt").write_text("\n".join(str(p) for p in selected_paths) + "\n") X_val_var = variant_feature_matrix(post95, [np.load(p).astype(np.float32) for p in selected_paths]) X_val = np.column_stack([X_val, X_val_var]).astype(np.float32) print(f"validation matrix {X_val.shape}") clf = lgb.LGBMClassifier( n_estimators=1200, learning_rate=0.025, num_leaves=31, subsample=0.9, colsample_bytree=0.9, reg_lambda=5.0, min_child_samples=80, objective="binary", verbose=-1, random_state=args.seed, ) clf.fit(X_val, y) print("building test features") test_pairs = np.array(read_txt(root / "data_and_docs" / "bipartite_test_ann.txt"), dtype=np.int64) parts = lgcn_mod.build_parts(root, None, 79937, split_seed=args.split_seed, train_frac=0.9) data_cache = {} main_test_score = score_val_path_on_test( root, args.split_seed, lgcn_mod, parts, data_cache, test_pairs, args.main_val_score_file, args.device, args.batch_size, ) full_refs = pd.DataFrame(read_txt(root / "data_and_docs" / "bipartite_train_ann.txt"), columns=["source", "target"]) test_builder = stack_mod.ExplicitGraphFeatures(root, full_refs) X_test_hand = test_builder.transform(test_pairs) X_test = np.column_stack( [ stack_mod.add_rank_features(test_pairs, main_test_score), X_test_hand, post95.negative_evidence_features(X_test_hand, main_test_score), topk_content_similarity_fast(root, test_pairs, test_builder), ] ).astype(np.float32) test_variant_scores = [ score_val_path_on_test(root, args.split_seed, lgcn_mod, parts, data_cache, test_pairs, p, args.device, args.batch_size) for p in selected_paths ] X_test_var = variant_feature_matrix(post95, test_variant_scores) X_test = np.column_stack([X_test, X_test_var]).astype(np.float32) print(f"test matrix {X_test.shape}") pred_score = clf.predict_proba(X_test)[:, 1].astype(np.float32) np.save(out_dir / "test_post95_ens_pred.npy", pred_score) make_submissions(root, out_dir, pred_score, args.ratios) if __name__ == "__main__": main()