| """Error analysis and grouped calibration around the public anchor RW stacker. |
| |
| This script intentionally reuses the cached DeepWalk / Node2Vec scores instead |
| of continuing random-walk parameter search. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import importlib.util |
| import warnings |
| from pathlib import Path |
|
|
| import lightgbm as lgb |
| import numpy as np |
| import pandas as pd |
| from sklearn.inspection import permutation_importance |
| from sklearn.metrics import f1_score, precision_recall_curve, roc_auc_score |
| from sklearn.model_selection import StratifiedKFold, train_test_split |
|
|
|
|
# Names of the hand-crafted graph features, in column order.
# build_feature_names() inserts this list right after the four LightGCN rank
# names, which is the same position where build_val_test_context() stacks the
# ExplicitGraphFeatures output (``Xh``).  Positional lookups such as
# ``Xh[:, 3]`` elsewhere in this file therefore presumably refer to positions
# in this list -- TODO confirm against ExplicitGraphFeatures.transform.
HAND_NAMES = [
    "author_degree",
    "paper_degree",
    "coauthor_count",
    "coauthors_read_paper_count",
    "coauthors_read_paper_ratio",
    "paper_citation_in",
    "paper_citation_out",
    "hist_ref_overlap",
    "hist_cited_by_overlap",
    "hist_ref_jaccard",
    "hist_cited_by_jaccard",
    "aap_binary",
    "aap_count",
    "app_count",
    "apap_count",
    "apap_ratio",
    "log_author_degree",
    "log_paper_degree",
]
|
|
|
|
def load_module(name: str, path: Path):
    """Execute the Python source file at ``path`` and return it as a module.

    The module is created from an explicit file location, so the file does not
    need to be importable through ``sys.path``.  The module is not inserted
    into ``sys.modules``.

    Args:
        name: Module name to assign to the loaded module.
        path: Location of the ``.py`` file to execute.

    Returns:
        The freshly executed module object.

    Raises:
        ImportError: If no import spec or loader can be built for ``path``
            (for example when the file suffix is not a Python source suffix).
    """
    spec = importlib.util.spec_from_file_location(name, path)
    # Validate before module_from_spec(): a ``None`` spec would otherwise fail
    # there with an unrelated AttributeError, and an ``assert`` would be
    # stripped entirely under ``python -O``.
    if spec is None or spec.loader is None:
        raise ImportError(f"cannot build an import spec for module {name!r} at {path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
|
|
|
|
def read_txt(path: Path) -> list[list[int]]:
    """Parse a whitespace-separated integer table, one row per line.

    Args:
        path: Text file whose lines each hold integers separated by whitespace.

    Returns:
        One list of ints per line, in file order.  A blank line yields an
        empty list, so row positions always match line positions.

    Raises:
        ValueError: If any token is not a valid integer literal.
    """
    # Use a context manager so the handle is closed deterministically instead
    # of being left for the garbage collector.
    with path.open() as handle:
        return [[int(token) for token in line.split()] for line in handle]
|
|
|
|
def prf(y: np.ndarray, pred: np.ndarray) -> tuple[float, float, float, int, int, int]:
    """Compute precision, recall, F1 and the confusion counts for binary labels.

    Both arrays are cast to int8 and compared against the literal values 0 and
    1.  A tiny epsilon in every denominator keeps the ratios finite (0.0) when
    there are no predicted or no actual positives.

    Returns:
        ``(precision, recall, f1, tp, fp, fn)``.
    """
    truth = y.astype(np.int8)
    guess = pred.astype(np.int8)
    actual_pos = truth == 1
    called_pos = guess == 1

    tp = int(np.count_nonzero(called_pos & actual_pos))
    fp = int(np.count_nonzero(called_pos & (truth == 0)))
    fn = int(np.count_nonzero((guess == 0) & actual_pos))

    precision = tp / (tp + fp + 1e-12)
    recall = tp / (tp + fn + 1e-12)
    f1 = 2 * precision * recall / (precision + recall + 1e-12)
    return precision, recall, f1, tp, fp, fn
|
|
|
|
def best_f1(y: np.ndarray, score: np.ndarray) -> tuple[float, float, float, float, float]:
    """Find the score threshold that maximises F1 along the precision-recall curve.

    Returns:
        ``(best_f1, threshold, roc_auc, precision_at_best, recall_at_best)``.
        When the best point has no matching entry in the threshold array
        returned by ``precision_recall_curve``, the threshold falls back to 0.5.
    """
    precision, recall, cutoffs = precision_recall_curve(y, score)
    f1_curve = 2 * precision * recall / (precision + recall + 1e-12)
    best = int(np.argmax(f1_curve))

    if best < len(cutoffs):
        threshold = float(cutoffs[best])
    else:
        # The winning curve point lies past the end of the threshold array.
        threshold = 0.5

    top_f1 = float(f1_curve[best])
    auc = float(roc_auc_score(y, score))
    return top_f1, threshold, auc, float(precision[best]), float(recall[best])
|
|
|
|
def score_at_threshold(y: np.ndarray, score: np.ndarray, th: float) -> dict:
    """Evaluate the hard predictions ``score >= th`` against ``y``.

    Returns:
        A dict with the threshold, F1, precision, recall, the fraction of rows
        predicted positive, and the tp / fp / fn counts.
    """
    hard = (score >= th).astype(np.int8)
    precision, recall, f1, tp, fp, fn = prf(y, hard)
    report = {"threshold": float(th)}
    report["f1"] = f1
    report["precision"] = precision
    report["recall"] = recall
    report["predicted_positive_ratio"] = float(hard.mean())
    report["tp"] = tp
    report["fp"] = fp
    report["fn"] = fn
    return report
|
|
|
|
def rank01(x: np.ndarray) -> np.ndarray:
    """Map values to evenly spaced float32 ranks in [0, 1] (smallest -> 0).

    Ties are broken by original position because the sort is stable; tied
    values therefore receive distinct ranks.
    """
    size = len(x)
    ascending = np.argsort(x, kind="mergesort")
    grid = np.linspace(0.0, 1.0, size, dtype=np.float32)
    ranks = np.empty(size, dtype=np.float32)
    # Scatter the grid back so that the i-th smallest value receives grid[i].
    np.put(ranks, ascending, grid)
    return ranks
|
|
|
|
def zscore(x: np.ndarray) -> np.ndarray:
    """Standardise ``x`` to zero mean and unit variance, returned as float32.

    A small epsilon is added to the standard deviation so a constant input
    produces zeros instead of dividing by zero.
    """
    centre = x.mean()
    spread = x.std() + 1e-8
    standardised = (x - centre) / spread
    return standardised.astype(np.float32)
|
|
|
|
def author_rank01(pairs: np.ndarray, score: np.ndarray) -> np.ndarray:
    """Rank each score within its author group and rescale to [0, 1].

    The author id is read from ``pairs[:, 0]``.  Inside a group the lowest
    score gets 0 and the highest gets 1, evenly spaced; ties are ordered by
    original row position (stable sort).  A group with a single row gets 1.0.
    """
    total = len(score)
    ranks = np.zeros(total, dtype=np.float32)
    frame = pd.DataFrame({"idx": np.arange(total), "author": pairs[:, 0], "score": score})
    for _, block in frame.groupby("author", sort=False):
        rows = block["idx"].to_numpy()
        ascending_rows = rows[np.argsort(block["score"].to_numpy(), kind="mergesort")]
        if len(rows) > 1:
            ranks[ascending_rows] = np.linspace(0, 1, len(rows), dtype=np.float32)
        else:
            ranks[ascending_rows] = np.array([1.0], dtype=np.float32)
    return ranks
|
|
|
|
def author_rank_position(pairs: np.ndarray, score: np.ndarray, descending: bool = True) -> np.ndarray:
    """Return the 1-based position of each row within its author's ranking.

    The author id is read from ``pairs[:, 0]``.  With ``descending=True`` the
    highest score in a group gets position 1; otherwise the lowest does.  The
    descending order is the exact reverse of the stable ascending order.
    Positions are returned as float32.
    """
    total = len(score)
    positions = np.zeros(total, dtype=np.float32)
    frame = pd.DataFrame({"idx": np.arange(total), "author": pairs[:, 0], "score": score})
    for _, block in frame.groupby("author", sort=False):
        rows = block["idx"].to_numpy()
        ordering = np.argsort(block["score"].to_numpy(), kind="mergesort")
        ordering = ordering[::-1] if descending else ordering
        positions[rows[ordering]] = np.arange(1, len(rows) + 1, dtype=np.float32)
    return positions
|
|
|
|
def score_features(scores: np.ndarray, prefix: str, pairs: np.ndarray) -> tuple[np.ndarray, list[str]]:
    """Expand one raw score vector into four float32 feature columns.

    Columns, in order: the raw score, its global z-score, its global [0, 1]
    rank, and its per-author [0, 1] rank.

    Returns:
        ``(matrix, names)`` where ``names`` are ``prefix`` followed by
        ``prefix`` with the ``_z``, ``_rank`` and ``_author_rank`` suffixes.
    """
    columns = [
        scores.astype(np.float32),
        zscore(scores),
        rank01(scores),
        author_rank01(pairs, scores),
    ]
    matrix = np.column_stack(columns).astype(np.float32)
    names = [prefix] + [f"{prefix}_{suffix}" for suffix in ("z", "rank", "author_rank")]
    return matrix, names
|
|
|
|
def fit_lgb_oof(
    X: np.ndarray,
    y: np.ndarray,
    seed: int,
    n_splits: int,
    train_mask: np.ndarray | None = None,
    params: dict | None = None,
) -> tuple[np.ndarray, list[lgb.LGBMClassifier]]:
    """Train one LightGBM classifier per stratified fold and collect OOF scores.

    Args:
        X: Feature matrix.
        y: Binary labels aligned with ``X``.
        seed: Seed for the fold shuffle; fold ``k`` (1-based) trains with
            ``random_state=seed + k``.
        n_splits: Number of stratified folds.
        train_mask: Optional mask selecting the rows that take part in the
            cross-validation.  Rows outside the mask are never trained on and
            keep an out-of-fold score of 0.
        params: Optional overrides merged on top of the default hyperparameters.

    Returns:
        ``(oof, models)``: float32 out-of-fold positive-class probabilities
        and the fitted per-fold classifiers in fold order.
    """
    # NOTE(review): per the LightGBM docs, ``subsample`` only takes effect
    # when ``subsample_freq`` > 0, so row bagging may be inactive here -- confirm.
    hyper = {
        "n_estimators": 1200,
        "learning_rate": 0.025,
        "num_leaves": 31,
        "subsample": 0.9,
        "colsample_bytree": 0.9,
        "reg_lambda": 5.0,
        "min_child_samples": 80,
        "objective": "binary",
        "verbose": -1,
    }
    hyper.update(params or {})

    if train_mask is None:
        eligible = np.arange(len(y))
    else:
        eligible = np.flatnonzero(train_mask)

    oof = np.zeros(len(y), dtype=np.float32)
    models: list[lgb.LGBMClassifier] = []
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    fold_iter = splitter.split(X[eligible], y[eligible])
    for fold, (fit_pos, hold_pos) in enumerate(fold_iter, start=1):
        # The splitter yields positions inside ``eligible``; map them back to
        # row indices of the full arrays.
        fit_rows = eligible[fit_pos]
        hold_rows = eligible[hold_pos]
        model = lgb.LGBMClassifier(**hyper, random_state=seed + fold)
        model.fit(X[fit_rows], y[fit_rows])
        oof[hold_rows] = model.predict_proba(X[hold_rows])[:, 1]
        models.append(model)
    return oof, models
|
|
|
|
def fit_full_lgb(X: np.ndarray, y: np.ndarray, seed: int, params: dict | None = None) -> lgb.LGBMClassifier:
    """Fit a single LightGBM classifier on all rows and return it.

    Args:
        X: Feature matrix.
        y: Binary labels aligned with ``X``.
        seed: Value used as ``random_state`` unless ``params`` overrides it.
        params: Optional overrides merged on top of the default hyperparameters.
    """
    # NOTE(review): per the LightGBM docs, ``subsample`` only takes effect
    # when ``subsample_freq`` > 0, so row bagging may be inactive here -- confirm.
    defaults = {
        "n_estimators": 1200,
        "learning_rate": 0.025,
        "num_leaves": 31,
        "subsample": 0.9,
        "colsample_bytree": 0.9,
        "reg_lambda": 5.0,
        "min_child_samples": 80,
        "objective": "binary",
        "verbose": -1,
        "random_state": seed,
    }
    hyper = {**defaults, **(params or {})}
    model = lgb.LGBMClassifier(**hyper)
    model.fit(X, y)
    return model
|
|
|
|
def average_predict(models: list[lgb.LGBMClassifier], X: np.ndarray) -> np.ndarray:
    """Average the positive-class probability of several models on ``X``.

    Accumulation happens in float32.  An empty model list yields all zeros
    (the divisor is clamped to at least 1).
    """
    total = np.zeros(X.shape[0], dtype=np.float32)
    for model in models:
        positive = model.predict_proba(X)[:, 1].astype(np.float32)
        np.add(total, positive, out=total)
    divisor = max(1, len(models))
    total /= divisor
    return total
|
|
|
|
def bucket_cut(values: np.ndarray, name: str, bins: list[float]) -> np.ndarray:
    """Assign each value to a left-closed bin and return the bin labels as strings.

    Labels have the form ``name[lo,hi)``, with infinite edges rendered as
    ``-inf`` / ``inf``.  Binning is done by ``pd.cut`` with ``right=False``.
    """
    labels = [
        "{}[{},{})".format(
            name,
            "-inf" if np.isneginf(lower) else f"{lower:g}",
            "inf" if np.isposinf(upper) else f"{upper:g}",
        )
        for lower, upper in zip(bins[:-1], bins[1:])
    ]
    binned = pd.cut(values, bins=bins, labels=labels, include_lowest=True, right=False)
    return np.asarray(binned.astype(str))
|
|
|
|
def bucket_quantile(values: np.ndarray, name: str, q: int = 10) -> np.ndarray:
    """Split ``values`` into up to ``q`` quantile bins and return the bin labels.

    Labels are the string form of the intervals chosen by ``pd.qcut``;
    duplicate bin edges are dropped, so fewer than ``q`` bins may result.
    ``name`` is accepted for signature parity with ``bucket_cut`` but is not
    used in the labels.
    """
    quantile_bins = pd.qcut(values, q=q, duplicates="drop")
    as_text = quantile_bins.astype(str)
    return np.asarray(as_text)
|
|
|
|
def load_submission_pred(path: Path) -> np.ndarray:
    """Read a submission CSV and return its ``Predicted`` column as int8."""
    submission = pd.read_csv(path)
    predicted = submission["Predicted"]
    return predicted.to_numpy(dtype=np.int8)
|
|
|
|
def write_submission(path: Path, score: np.ndarray, known: np.ndarray, threshold: float | None = None, ratio: float | None = None) -> np.ndarray:
    """Binarise ``score``, force known rows positive, and write a submission CSV.

    Args:
        path: Destination CSV; written with columns ``Index`` and ``Predicted``.
        score: One score per test row.
        known: Index or boolean mask of rows that are always predicted 1.
        threshold: If given, rows with ``score >= threshold`` are positive.
            Takes precedence over ``ratio``.
        ratio: If ``threshold`` is None, the top ``round(len(score) * ratio)``
            rows by score are positive.  The count is clamped to
            ``[0, len(score)]``.

    Returns:
        The int8 prediction vector that was written.

    Raises:
        ValueError: If neither ``threshold`` nor ``ratio`` is provided.
    """
    if threshold is None and ratio is None:
        raise ValueError("threshold or ratio is required")
    if threshold is not None:
        pred = (score >= threshold).astype(np.int8)
    else:
        pred = np.zeros(len(score), dtype=np.int8)
        top_k = int(round(len(score) * float(ratio)))
        top_k = min(len(score), max(0, top_k))
        # Guard the zero case explicitly: ``order[-0:]`` is the whole array,
        # which would mark every row positive instead of none.
        if top_k > 0:
            order = np.argsort(score, kind="mergesort")
            pred[order[-top_k:]] = 1
    pred[known] = 1
    pd.DataFrame({"Index": np.arange(len(pred), dtype=np.int64), "Predicted": pred}).to_csv(path, index=False)
    return pred
|
|
|
|
def write_group_submission(
    path: Path,
    score: np.ndarray,
    groups: np.ndarray,
    thresholds: dict[str, float],
    known: np.ndarray,
) -> np.ndarray:
    """Binarise ``score`` with a separate threshold per group and write a CSV.

    Rows whose group label has no entry in ``thresholds`` stay 0.  Rows
    selected by ``known`` are forced to 1 afterwards.  The CSV has the columns
    ``Index`` and ``Predicted``.

    Returns:
        The int8 prediction vector that was written.
    """
    labels = np.zeros(len(score), dtype=np.int8)
    for group_name, cutoff in thresholds.items():
        members = groups == group_name
        above = score[members] >= cutoff
        labels[members] = above.astype(np.int8)
    labels[known] = 1

    row_ids = np.arange(len(labels), dtype=np.int64)
    table = pd.DataFrame({"Index": row_ids, "Predicted": labels})
    table.to_csv(path, index=False)
    return labels
|
|
|
|
def group_threshold(y: np.ndarray, score: np.ndarray, groups: np.ndarray) -> tuple[dict[str, float], np.ndarray, dict]:
    """Pick an F1-optimal threshold per group and evaluate the combined predictions.

    A group with fewer than 20 rows, or whose labels are all one class, uses
    the threshold that is F1-optimal over the whole of ``y`` / ``score``.

    Args:
        y: Binary labels.
        score: Scores aligned with ``y``.
        groups: Group label per row; rows are matched by comparing ``groups``
            against the string form of each distinct non-null label.

    Returns:
        ``(thresholds, pred, metrics)``: the threshold per group label, the
        int8 predictions over all rows, and a dict of overall metrics whose
        ``"group_rows"`` entry lists the per-group metrics.
    """
    pred = np.zeros(len(score), dtype=np.int8)
    thresholds: dict[str, float] = {}
    rows = []
    # The global fallback is identical for every small group, so compute it at
    # most once, and only if some group actually needs it.
    fallback_th: float | None = None
    for raw_label in pd.Series(groups).dropna().unique():
        g = str(raw_label)
        mask = groups == g
        n_rows = int(mask.sum())
        if n_rows < 20 or len(np.unique(y[mask])) < 2:
            if fallback_th is None:
                _, fallback_th, _, _, _ = best_f1(y, score)
            th = fallback_th
        else:
            _, th, _, _, _ = best_f1(y[mask], score[mask])
        thresholds[g] = float(th)
        pred[mask] = (score[mask] >= th).astype(np.int8)
        p, r, f, tp, fp, fn = prf(y[mask], pred[mask])
        rows.append({"group": g, "threshold": th, "n": n_rows, "f1": f, "precision": p, "recall": r, "tp": tp, "fp": fp, "fn": fn})
    p, r, f, tp, fp, fn = prf(y, pred)
    metrics = {"f1": f, "precision": p, "recall": r, "predicted_positive_ratio": float(pred.mean()), "tp": tp, "fp": fp, "fn": fn, "group_rows": rows}
    return thresholds, pred, metrics
|
|
|
|
def summarize_pred(name: str, y: np.ndarray, score: np.ndarray, pred: np.ndarray, path: Path | None, anchor_pred: np.ndarray | None) -> dict:
    """Build one experiment-summary row from validation predictions.

    ``score`` is accepted but not used.  When both ``path`` and
    ``anchor_pred`` are given, the submission at ``path`` is re-read from disk
    and compared with ``anchor_pred`` to count changed predictions; otherwise
    that field is an empty string.
    """
    precision, recall, f1, tp, fp, fn = prf(y, pred)
    positive_ratio = float(pred.mean())

    if path is None:
        path_text = ""
    else:
        path_text = str(path)

    if anchor_pred is None or path is None:
        changed = ""
    else:
        changed = int((load_submission_pred(path) != anchor_pred).sum())

    return {
        "experiment": name,
        "validation_f1": f1,
        "precision": precision,
        "recall": recall,
        "predicted_positive_ratio": positive_ratio,
        "tp": tp,
        "fp": fp,
        "fn": fn,
        "public_submission_path": path_text,
        "changed_predictions_vs_current_best": changed,
    }
|
|
|
|
def build_feature_names(selected: list[Path], rw_names: list[str]) -> list[str]:
    """Return the feature-column names in the order the matrices are stacked.

    Args:
        selected: Paths of the selected variant score files; each contributes
            a ``<grandparent dir>__<file stem>`` prefix with ``_z`` and
            ``_rank`` suffixes.
        rw_names: Names of the random-walk score features, appended last.
    """
    lgcn_names = ["lgcn_score", "lgcn_global_rank", "lgcn_author_pct", "author_internal_rank"]
    evidence_names = [
        "has_local_evidence",
        "lgcn_x_local_evidence",
        "lgcn_x_no_local_evidence",
        "lgcn_div_log_paper_degree",
        "paper_degree_pct",
        "paper_degree_x_ref_overlap",
        "paper_degree_x_cited_overlap",
        "paper_degree_x_app_count",
    ]
    topk_names = ["top1_content_sim", "top3_content_sim", "top5_content_sim"]

    variant_names: list[str] = []
    for score_path in selected:
        stem = f"{score_path.parent.parent.name}__{score_path.stem}"
        variant_names.extend([f"{stem}_z", f"{stem}_rank"])
    variant_names.extend(["lgcn_variant_mean_z", "lgcn_variant_std_z", "lgcn_variant_mean_rank"])

    content_names = ["content_mean_cos", "content_mean_cos_z", "content_mean_cos_rank", "content_mean_cos_author_rank"]
    mf_names = ["mf_bpr", "mf_bpr_z", "mf_bpr_rank", "mf_bpr_author_rank"]

    return [
        *lgcn_names,
        *HAND_NAMES,
        *evidence_names,
        *topk_names,
        *variant_names,
        *content_names,
        *mf_names,
        *rw_names,
    ]
|
|
|
|
def build_val_test_context(root: Path, split_seed: int, main_val_score_file: Path) -> dict:
    """Assemble validation and test feature matrices plus the raw per-source scores.

    Five sibling scripts under ``root/code`` are loaded with ``load_module``
    and supply the split, the explicit graph features and the derived feature
    helpers.  For both the validation and the test split the columns are
    stacked in the same order: LightGCN rank features, explicit graph
    features, negative-evidence features, top-k content similarity,
    selected-variant features, content-mean score features, BPR-MF score
    features, then DeepWalk / Node2Vec cos and dot score features.

    Args:
        root: Package root containing ``code``, ``data_and_docs`` and
            ``validation_runs``.
        split_seed: Seed of the validation split; also selects the
            ``validation_runs/dynamic_seed{split_seed}`` directory.
        main_val_score_file: ``.npy`` file with the main validation scores.

    Returns:
        A dict with ``pairs``, ``y``, ``test_pairs``, ``X``, ``X_test``,
        ``feature_names``, ``Xh``, ``Xh_test``, ``scores`` and ``test_scores``.

    Raises:
        AssertionError: If either feature matrix has a different number of
            columns than ``build_feature_names`` produced names for.
    """
    root = root.resolve()
    # Helper code lives in standalone scripts, so load them by file path.
    stack = load_module("stack", root / "code" / "stack_rank_calibration.py")
    lgcn = load_module("lgcn", root / "code" / "train_val_lgcn_ensemble.py")
    post = load_module("post", root / "code" / "post95_ablation.py")
    gen = load_module("gen", root / "code" / "generate_post95_submission.py")
    extra = load_module("extra", root / "code" / "extra_score_sources_ablation.py")

    # --- validation split: labelled pairs, main scores, explicit graph features ---
    train_refs, val_pairs = lgcn.make_notebook_style_split(root, split_seed, 0.9)
    val_arr = val_pairs[["source", "target"]].to_numpy(np.int64)
    y = val_pairs["label"].to_numpy(np.int8)
    main_val = np.load(main_val_score_file).astype(np.float32)
    val_builder = stack.ExplicitGraphFeatures(root, train_refs)
    Xh_val = val_builder.transform(val_arr)

    # One score-file path per non-empty line of the selection file.
    selected = [Path(x.strip()) for x in (root / "validation_runs" / f"dynamic_seed{split_seed}" / "post95_submission" / "selected_variant_val_scores.txt").read_text().splitlines() if x.strip()]
    X_val = np.column_stack(
        [
            stack.add_rank_features(val_arr, main_val),
            Xh_val,
            post.negative_evidence_features(Xh_val, main_val),
            gen.topk_content_similarity_fast(root, val_arr, val_builder),
            gen.variant_feature_matrix(post, [np.load(p).astype(np.float32) for p in selected]),
        ]
    ).astype(np.float32)
    content_val = extra.content_mean_score(root, val_arr, val_builder)
    mf_val = np.load(root / "validation_runs" / f"dynamic_seed{split_seed}" / "extra_score_sources" / f"val_mf_bpr_s{split_seed}_d256.npy").astype(np.float32)
    Xc_val, _ = score_features(content_val, "content_mean_cos", val_arr)
    Xm_val, _ = score_features(mf_val, "mf_bpr", val_arr)
    X_val = np.column_stack([X_val, Xc_val, Xm_val]).astype(np.float32)

    # --- cached random-walk scores for validation ---
    # Cache files are named after the pair count and the sum of each id
    # column of the pair array.
    base_dir = root / "validation_runs" / f"dynamic_seed{split_seed}" / "node2vec_deepwalk"
    rw_raw_val = {}
    rw_names = []
    for name in ["deepwalk", "node2vec"]:
        cos = np.load(base_dir / f"{name}_cos_{len(val_arr)}_{int(val_arr[:,0].sum())}_{int(val_arr[:,1].sum())}.npy").astype(np.float32)
        dot = np.load(base_dir / f"{name}_dot_{len(val_arr)}_{int(val_arr[:,0].sum())}_{int(val_arr[:,1].sum())}.npy").astype(np.float32)
        rw_raw_val[f"{name}_cos"] = cos
        rw_raw_val[f"{name}_dot"] = dot
        Xcos, names = score_features(cos, f"{name}_cos", val_arr)
        Xdot, names2 = score_features(dot, f"{name}_dot", val_arr)
        rw_names += names + names2
        X_val = np.column_stack([X_val, Xcos, Xdot]).astype(np.float32)

    # --- test split: same column layout, graph features built from the full training file ---
    test_arr = np.array(read_txt(root / "data_and_docs" / "bipartite_test_ann.txt"), dtype=np.int64)
    full_refs = pd.DataFrame(read_txt(root / "data_and_docs" / "bipartite_train_ann.txt"), columns=["source", "target"])
    test_builder = stack.ExplicitGraphFeatures(root, full_refs)
    # NOTE(review): the run name "dyn202_..." is hard-coded and does not follow
    # ``split_seed`` -- confirm this is intended for seeds other than 202.
    main_test = np.load(root / "validation_runs" / f"dynamic_seed{split_seed}" / "post95_test_scores" / "dyn202_l2d512_bpr_bigbatch_more" / "scores" / "test_vanilla_ensemble_mean.npy").astype(np.float32)
    Xh_test = test_builder.transform(test_arr)
    X_test = np.column_stack(
        [
            stack.add_rank_features(test_arr, main_test),
            Xh_test,
            post.negative_evidence_features(Xh_test, main_test),
            gen.topk_content_similarity_fast(root, test_arr, test_builder),
        ]
    ).astype(np.float32)
    test_variant_scores = []
    for p in selected:
        # Map each validation score file to its test counterpart: same path
        # relative to the seed directory, placed under post95_test_scores,
        # with the first "val_" in the file name replaced by "test_".
        rel = p.resolve().relative_to(root / "validation_runs" / f"dynamic_seed{split_seed}")
        tp = root / "validation_runs" / f"dynamic_seed{split_seed}" / "post95_test_scores" / rel.parent / rel.name.replace("val_", "test_", 1)
        test_variant_scores.append(np.load(tp).astype(np.float32))
    X_test = np.column_stack([X_test, gen.variant_feature_matrix(post, test_variant_scores)]).astype(np.float32)
    content_test = extra.content_mean_score(root, test_arr, test_builder)
    # NOTE(review): this file name also hard-codes "s202" rather than using
    # ``split_seed`` -- confirm.
    mf_test = np.load(root / "validation_runs" / f"dynamic_seed{split_seed}" / "extra_bprmf_submission" / "test_mf_bpr_dynamic_s202_d256_e220.npy").astype(np.float32)
    Xc_test, _ = score_features(content_test, "content_mean_cos", test_arr)
    Xm_test, _ = score_features(mf_test, "mf_bpr", test_arr)
    X_test = np.column_stack([X_test, Xc_test, Xm_test]).astype(np.float32)

    # --- cached random-walk scores for test (same naming scheme as validation) ---
    rw_raw_test = {}
    for name in ["deepwalk", "node2vec"]:
        cos = np.load(base_dir / f"{name}_cos_{len(test_arr)}_{int(test_arr[:,0].sum())}_{int(test_arr[:,1].sum())}.npy").astype(np.float32)
        dot = np.load(base_dir / f"{name}_dot_{len(test_arr)}_{int(test_arr[:,0].sum())}_{int(test_arr[:,1].sum())}.npy").astype(np.float32)
        rw_raw_test[f"{name}_cos"] = cos
        rw_raw_test[f"{name}_dot"] = dot
        Xcos, _ = score_features(cos, f"{name}_cos", test_arr)
        Xdot, _ = score_features(dot, f"{name}_dot", test_arr)
        X_test = np.column_stack([X_test, Xcos, Xdot]).astype(np.float32)

    # Both matrices must have exactly one column per feature name.
    feature_names = build_feature_names(selected, rw_names)
    assert X_val.shape[1] == len(feature_names), (X_val.shape, len(feature_names))
    assert X_test.shape[1] == len(feature_names), (X_test.shape, len(feature_names))

    # "final" holds the stacker scores loaded from disk (OOF file for
    # validation, prediction file for test); "deepwalk" / "node2vec" are the
    # cosine variants of the random-walk scores.
    return {
        "pairs": val_arr,
        "y": y,
        "test_pairs": test_arr,
        "X": X_val,
        "X_test": X_test,
        "feature_names": feature_names,
        "Xh": Xh_val,
        "Xh_test": Xh_test,
        "scores": {
            "final": np.load(root / "validation_runs" / f"dynamic_seed{split_seed}" / "node2vec_deepwalk" / "node2vec_stack_oof.npy").astype(np.float32),
            "content": content_val,
            "mf_bpr": mf_val,
            "lgcn": main_val,
            "deepwalk": rw_raw_val["deepwalk_cos"],
            "node2vec": rw_raw_val["node2vec_cos"],
            "deepwalk_dot": rw_raw_val["deepwalk_dot"],
            "node2vec_dot": rw_raw_val["node2vec_dot"],
        },
        "test_scores": {
            "final": np.load(root / "validation_runs" / f"dynamic_seed{split_seed}" / "node2vec_deepwalk_submission" / "test_content_mf_deepwalk_node2vec_lgb_pred.npy").astype(np.float32),
            "content": content_test,
            "mf_bpr": mf_test,
            "lgcn": main_test,
            "deepwalk": rw_raw_test["deepwalk_cos"],
            "node2vec": rw_raw_test["node2vec_cos"],
            "deepwalk_dot": rw_raw_test["deepwalk_dot"],
            "node2vec_dot": rw_raw_test["node2vec_dot"],
        },
    }
|
|
|
|
def error_analysis(ctx: dict, pred: np.ndarray, out_dir: Path) -> None:
    """Break validation errors down by bucket and write them to a CSV.

    Rows are bucketed by degree and citation columns of ``ctx["Xh"]``, by
    deciles of each score source, by a local-evidence flag, and by the
    per-author rank position of the final score.  For every bucket the row
    count, positives, fp / fn and precision / recall / F1 of ``pred`` are
    written to ``out_dir / "error_analysis_buckets.csv"``.
    """
    labels = ctx["y"]
    hand = ctx["Xh"]
    model_scores = ctx["scores"]

    # A row has local evidence when the sum of these Xh columns is positive.
    local_total = hand[:, 3] + hand[:, 7] + hand[:, 8] + hand[:, 12] + hand[:, 13] + hand[:, 14]
    has_local = (local_total > 0).astype(np.int8)

    # Insertion order determines the row order of the CSV.
    bucket_defs = {}
    bucket_defs["author_degree"] = bucket_cut(hand[:, 0], "author_degree", [-np.inf, 1, 3, 8, 20, 50, np.inf])
    for key, column in (("paper_degree", 1), ("paper_citation_in", 5)):
        bucket_defs[key] = bucket_cut(hand[:, column], key, [-np.inf, 1, 3, 10, 30, 100, np.inf])
    decile_sources = (
        ("final_score", "final"),
        ("content_score", "content"),
        ("BPR-MF_score", "mf_bpr"),
        ("LightGCN_score", "lgcn"),
        ("DeepWalk_score", "deepwalk"),
        ("Node2Vec_score", "node2vec"),
    )
    for key, source in decile_sources:
        bucket_defs[key] = bucket_quantile(model_scores[source], key, 10)
    bucket_defs["has_local_evidence"] = np.where(has_local > 0, "has_local_evidence=1", "has_local_evidence=0")
    bucket_defs["author_internal_rank"] = bucket_cut(
        author_rank_position(ctx["pairs"], model_scores["final"]),
        "author_internal_rank",
        [-np.inf, 1, 3, 5, 10, 20, 50, np.inf],
    )

    records = []
    for bucket_type, groups in bucket_defs.items():
        for value in pd.Series(groups).dropna().unique():
            member = groups == value
            y_in = labels[member]
            pred_in = pred[member]
            precision, recall, f1, _tp, fp, fn = prf(y_in, pred_in)
            records.append(
                {
                    "bucket_type": bucket_type,
                    "bucket": str(value),
                    "n": int(member.sum()),
                    "actual_positive": int(y_in.sum()),
                    "predicted_positive": int(pred_in.sum()),
                    "fp": fp,
                    "fn": fn,
                    "precision": precision,
                    "recall": recall,
                    "f1": f1,
                }
            )
    pd.DataFrame(records).to_csv(out_dir / "error_analysis_buckets.csv", index=False)
|
|
|
|
def add_agreement_features(ctx: dict) -> tuple[np.ndarray, np.ndarray, list[str], pd.DataFrame]:
    """Derive vote-count and rank-gap features from five score sources.

    For each of lgcn, mf_bpr, content, deepwalk and node2vec the F1-optimal
    threshold is found on validation; the same threshold turns both the
    validation and the test scores into 0/1 votes.  Per-author [0, 1] ranks
    are computed separately for each split.

    Returns:
        ``(X, Xt, names, report)``: float32 feature matrices for validation
        and test, their five column names, and a DataFrame with the best F1,
        threshold, AUC, precision and recall of every source.
    """
    val_pairs = ctx["pairs"]
    test_pairs = ctx["test_pairs"]
    labels = ctx["y"]

    # Column order of the vote matrices; "content" is the only non-graph source.
    source_names = ("lgcn", "mf_bpr", "content", "deepwalk", "node2vec")
    graph_names = ("lgcn", "mf_bpr", "deepwalk", "node2vec")
    val_scores = {key: ctx["scores"][key] for key in source_names}
    test_scores = {key: ctx["test_scores"][key] for key in source_names}

    report = []
    val_votes, test_votes = {}, {}
    val_ranks, test_ranks = {}, {}
    for key in source_names:
        best, cutoff, auc, precision, recall = best_f1(labels, val_scores[key])
        report.append({"source": key, "best_f1": best, "threshold": cutoff, "auc": auc, "precision": precision, "recall": recall})
        val_votes[key] = (val_scores[key] >= cutoff).astype(np.float32)
        test_votes[key] = (test_scores[key] >= cutoff).astype(np.float32)
        val_ranks[key] = author_rank01(val_pairs, val_scores[key])
        test_ranks[key] = author_rank01(test_pairs, test_scores[key])

    def _assemble(votes: dict, ranks: dict) -> np.ndarray:
        # Build the five agreement columns for one split.
        vote_matrix = np.vstack([votes[key] for key in source_names]).T.astype(np.float32)
        graph_columns = [source_names.index(key) for key in graph_names]
        graph_votes = vote_matrix[:, graph_columns].sum(axis=1)
        content_vote = vote_matrix[:, source_names.index("content")]
        graph_rank_mean = (ranks["lgcn"] + ranks["mf_bpr"] + ranks["deepwalk"] + ranks["node2vec"]) / 4.0
        columns = [
            vote_matrix.sum(axis=1),
            graph_votes,
            content_vote - graph_votes / 4.0,
            ranks["lgcn"] - ranks["mf_bpr"],
            ranks["content"] - graph_rank_mean,
        ]
        return np.column_stack(columns).astype(np.float32)

    X = _assemble(val_votes, val_ranks)
    Xt = _assemble(test_votes, test_ranks)
    names = ["vote_count", "graph_vote_count", "content_vs_graph_disagreement", "lgcn_bpr_gap", "content_graph_gap"]
    return X, Xt, names, pd.DataFrame(report)
|
|
|
|
def boundary_features(ctx: dict, agreement: np.ndarray, agreement_test: np.ndarray) -> tuple[np.ndarray, np.ndarray, list[str]]:
    """Build the compact feature set used by the boundary re-ranking model.

    The same fourteen columns are produced for validation and test: four
    per-author ranks (content, BPR-MF, LightGCN, mean of the DeepWalk and
    Node2Vec scores), six columns taken from the explicit feature matrix, the
    negated global rank of column 1 of that matrix, a local-evidence flag,
    and the first two agreement columns.

    Returns:
        ``(X, Xt, names)``: float32 matrices for validation and test plus the
        column names.
    """

    def _build(pairs: np.ndarray, scores: dict, hand: np.ndarray, agree: np.ndarray) -> np.ndarray:
        # Assemble the fourteen boundary columns for one split.
        local_total = hand[:, 3] + hand[:, 7] + hand[:, 8] + hand[:, 12] + hand[:, 13] + hand[:, 14]
        has_local = (local_total > 0).astype(np.float32)
        rw_score = 0.5 * (scores["deepwalk"] + scores["node2vec"])
        columns = [author_rank01(pairs, scores[key]) for key in ("content", "mf_bpr", "lgcn")]
        columns.append(author_rank01(pairs, rw_score))
        columns.extend(hand[:, col] for col in (3, 4, 7, 8, 9, 10))
        columns.append(-rank01(hand[:, 1]))
        columns.append(has_local)
        columns.append(agree[:, 0])
        columns.append(agree[:, 1])
        return np.column_stack(columns).astype(np.float32)

    X = _build(ctx["pairs"], ctx["scores"], ctx["Xh"], agreement)
    Xt = _build(ctx["test_pairs"], ctx["test_scores"], ctx["Xh_test"], agreement_test)
    names = [
        "content_rank",
        "bpr_mf_rank",
        "lgcn_rank",
        "rw_rank",
        "coauthor_evidence_count",
        "coauthor_evidence_ratio",
        "citation_ref_overlap",
        "citation_cited_by_overlap",
        "citation_ref_jaccard",
        "citation_cited_by_jaccard",
        "paper_popularity_penalty",
        "has_local_evidence",
        "vote_count",
        "graph_vote_count",
    ]
    return X, Xt, names
|
|
|
|
def main() -> None:
    """Run the error analysis and every calibration experiment around the anchor.

    Stages, in order: error buckets for the anchor threshold, a global
    threshold sweep, per-group thresholds, a boundary re-ranking model with
    ablations, an agreement-feature stacker, feature importance, and feature
    pruning.  Each stage writes CSV reports under
    ``validation_runs/dynamic_seed{split_seed}/error_group_calibration`` and
    candidate submissions under its ``submissions`` sub-directory; a combined
    summary is written and printed at the end.
    """
    warnings.filterwarnings("ignore", message="X does not have valid feature names")
    parser = argparse.ArgumentParser()
    parser.add_argument("--package-root", type=Path, default=Path(__file__).resolve().parents[1])
    parser.add_argument("--split-seed", type=int, default=202)
    parser.add_argument("--main-val-score-file", type=Path, default=None)
    parser.add_argument("--seed", type=int, default=202)
    parser.add_argument("--n-splits", type=int, default=5)
    args = parser.parse_args()

    root = args.package_root.resolve()
    # Fall back to the default validation score file when none is given.
    main_val = args.main_val_score_file or root / "validation_runs" / f"dynamic_seed{args.split_seed}" / "dyn202_l2d512_bpr_bigbatch_more" / "scores" / "val_vanilla_ensemble_mean.npy"
    out_dir = root / "validation_runs" / f"dynamic_seed{args.split_seed}" / "error_group_calibration"
    sub_dir = out_dir / "submissions"
    out_dir.mkdir(parents=True, exist_ok=True)
    sub_dir.mkdir(parents=True, exist_ok=True)

    ctx = build_val_test_context(root, args.split_seed, main_val)
    y = ctx["y"]
    score = ctx["scores"]["final"]
    test_score = ctx["test_scores"]["final"]
    # Test rows in this mask are always predicted positive by the writers.
    known = np.load(root / "cached_scores" / "test_known_mask.npy").astype(bool)
    anchor_path = root / "validation_runs" / f"dynamic_seed{args.split_seed}" / "node2vec_deepwalk_submission" / "submission_content_mf_deepwalk_node2vec_lgb_th0.480000.csv"
    anchor_pred = load_submission_pred(anchor_path)

    # --- anchor at threshold 0.48: validation metrics and error buckets ---
    overall_rows = []
    best_anchor = score_at_threshold(y, score, 0.48)
    anchor_val_pred = (score >= 0.48).astype(np.int8)
    error_analysis(ctx, anchor_val_pred, out_dir)
    overall_rows.append(summarize_pred("current_anchor_th0.480", y, score, anchor_val_pred, anchor_path, anchor_pred))

    # --- global threshold sweep above the anchor threshold ---
    threshold_rows = []
    for th in [0.485, 0.490, 0.495, 0.500, 0.505, 0.510, 0.515]:
        path = sub_dir / f"submission_node2vec_deepwalk_th{th:.3f}.csv"
        pred_test = write_submission(path, test_score, known, threshold=th)
        row = score_at_threshold(y, score, th)
        row["public_submission_path"] = str(path)
        row["changed_predictions_vs_current_best"] = int((pred_test != anchor_pred).sum())
        row["test_predicted_positive_ratio"] = float(pred_test.mean())
        threshold_rows.append(row)
        # NOTE(review): "predicted_positive_ratio" here is the TEST ratio,
        # while the rows built by summarize_pred() use the validation ratio.
        overall_rows.append(
            {
                "experiment": f"threshold_{th:.3f}",
                "validation_f1": row["f1"],
                "precision": row["precision"],
                "recall": row["recall"],
                "predicted_positive_ratio": row["test_predicted_positive_ratio"],
                "tp": row["tp"],
                "fp": row["fp"],
                "fn": row["fn"],
                "public_submission_path": str(path),
                "changed_predictions_vs_current_best": row["changed_predictions_vs_current_best"],
            }
        )
    pd.DataFrame(threshold_rows).to_csv(out_dir / "threshold_sweep.csv", index=False)

    # --- per-group thresholds: learned on validation groups, applied to test groups ---
    Xh = ctx["Xh"]
    Xht = ctx["Xh_test"]
    has_local = ((Xh[:, 3] + Xh[:, 7] + Xh[:, 8] + Xh[:, 12] + Xh[:, 13] + Xh[:, 14]) > 0).astype(np.int8)
    has_local_t = ((Xht[:, 3] + Xht[:, 7] + Xht[:, 8] + Xht[:, 12] + Xht[:, 13] + Xht[:, 14]) > 0).astype(np.int8)
    # Each entry pairs the validation group labels with the test group labels.
    group_defs = {
        "author_degree": (
            bucket_cut(Xh[:, 0], "author_degree", [-np.inf, 1, 3, 8, 20, 50, np.inf]),
            bucket_cut(Xht[:, 0], "author_degree", [-np.inf, 1, 3, 8, 20, 50, np.inf]),
        ),
        "paper_degree": (
            bucket_cut(Xh[:, 1], "paper_degree", [-np.inf, 1, 3, 10, 30, 100, np.inf]),
            bucket_cut(Xht[:, 1], "paper_degree", [-np.inf, 1, 3, 10, 30, 100, np.inf]),
        ),
        "has_local_evidence": (
            np.where(has_local > 0, "has_local_evidence=1", "has_local_evidence=0"),
            np.where(has_local_t > 0, "has_local_evidence=1", "has_local_evidence=0"),
        ),
    }
    group_rows = []
    for name, (groups, test_groups) in group_defs.items():
        thresholds, pred, metrics = group_threshold(y, score, groups)
        path = sub_dir / f"submission_group_threshold_{name}.csv"
        pred_test = write_group_submission(path, test_score, test_groups, thresholds, known)
        # pop() removes the per-group rows so ``metrics`` can be spread into a flat row below.
        pd.DataFrame(metrics.pop("group_rows")).to_csv(out_dir / f"group_threshold_{name}_details.csv", index=False)
        pd.Series(thresholds).to_csv(out_dir / f"group_threshold_{name}_thresholds.csv")
        group_rows.append({"grouping": name, **metrics, "public_submission_path": str(path), "changed_predictions_vs_current_best": int((pred_test != anchor_pred).sum()), "test_predicted_positive_ratio": float(pred_test.mean())})
        overall_rows.append(
            {
                "experiment": f"group_threshold_{name}",
                "validation_f1": metrics["f1"],
                "precision": metrics["precision"],
                "recall": metrics["recall"],
                "predicted_positive_ratio": float(pred_test.mean()),
                "tp": metrics["tp"],
                "fp": metrics["fp"],
                "fn": metrics["fn"],
                "public_submission_path": str(path),
                "changed_predictions_vs_current_best": int((pred_test != anchor_pred).sum()),
            }
        )
    pd.DataFrame(group_rows).to_csv(out_dir / "groupwise_calibration.csv", index=False)

    # --- agreement (vote / rank-gap) features shared by the next two stages ---
    agree_val, agree_test, agree_names, vote_thresholds = add_agreement_features(ctx)
    vote_thresholds.to_csv(out_dir / "model_vote_thresholds.csv", index=False)

    # --- boundary model: re-decide only rows whose final score lies in [0.45, 0.55] ---
    boundary_X, boundary_Xt, boundary_names = boundary_features(ctx, agree_val, agree_test)
    boundary_mask = (score >= 0.45) & (score <= 0.55)
    boundary_oof, boundary_models = fit_lgb_oof(boundary_X, y, args.seed + 51, args.n_splits, train_mask=boundary_mask, params={"n_estimators": 700, "learning_rate": 0.03, "min_child_samples": 40})
    # Outside the band the final score decides; inside, the boundary model's
    # OOF score is cut at its own F1-optimal threshold.
    boundary_pred = np.zeros(len(y), dtype=np.int8)
    boundary_pred[score > 0.55] = 1
    boundary_pred[score < 0.45] = 0
    _, boundary_th, _, _, _ = best_f1(y[boundary_mask], boundary_oof[boundary_mask])
    boundary_pred[boundary_mask] = (boundary_oof[boundary_mask] >= boundary_th).astype(np.int8)
    # Test: average the fold models and apply the same band and threshold.
    boundary_test_score = average_predict(boundary_models, boundary_Xt)
    boundary_test_pred = np.zeros(len(test_score), dtype=np.int8)
    boundary_test_pred[test_score > 0.55] = 1
    boundary_test_pred[test_score < 0.45] = 0
    boundary_test_mask = (test_score >= 0.45) & (test_score <= 0.55)
    boundary_test_pred[boundary_test_mask] = (boundary_test_score[boundary_test_mask] >= boundary_th).astype(np.int8)
    boundary_test_pred[known] = 1
    boundary_path = sub_dir / "submission_boundary_lgbm_045_055.csv"
    pd.DataFrame({"Index": np.arange(len(boundary_test_pred), dtype=np.int64), "Predicted": boundary_test_pred}).to_csv(boundary_path, index=False)
    pd.Series(boundary_names).to_csv(out_dir / "boundary_feature_names.csv", index=False)
    overall_rows.append(summarize_pred("boundary_lgbm_045_055", y, score, boundary_pred, boundary_path, anchor_pred))

    # --- boundary ablations: same procedure on column subsets (validation only) ---
    boundary_ablation = []
    ablations = {
        "boundary_all": np.arange(boundary_X.shape[1]),
        "no_vote_features": np.array([i for i, n in enumerate(boundary_names) if "vote" not in n], dtype=int),
        "rank_only": np.array([0, 1, 2, 3], dtype=int),
        "evidence_only": np.array([4, 5, 6, 7, 8, 9, 10, 11], dtype=int),
    }
    for name, cols in ablations.items():
        oof, _ = fit_lgb_oof(boundary_X[:, cols], y, args.seed + 71, args.n_splits, train_mask=boundary_mask, params={"n_estimators": 500, "learning_rate": 0.035, "min_child_samples": 40})
        # Rows below the band stay 0 from the zero initialisation.
        pred = np.zeros(len(y), dtype=np.int8)
        pred[score > 0.55] = 1
        _, th, auc, _, _ = best_f1(y[boundary_mask], oof[boundary_mask])
        pred[boundary_mask] = (oof[boundary_mask] >= th).astype(np.int8)
        p, r, f, tp, fp, fn = prf(y, pred)
        boundary_ablation.append({"stage": name, "validation_f1": f, "boundary_threshold": th, "boundary_auc": auc, "precision": p, "recall": r, "tp": tp, "fp": fp, "fn": fn, "n_features": len(cols)})
    pd.DataFrame(boundary_ablation).to_csv(out_dir / "boundary_ablation.csv", index=False)

    # --- agreement stacker: full feature set plus agreement columns ---
    X_agree = np.column_stack([ctx["X"], agree_val]).astype(np.float32)
    Xt_agree = np.column_stack([ctx["X_test"], agree_test]).astype(np.float32)
    agree_oof, agree_models = fit_lgb_oof(X_agree, y, args.seed + 91, args.n_splits)
    f, th, auc, p, r = best_f1(y, agree_oof)
    agree_test_score = average_predict(agree_models, Xt_agree)
    agree_path = sub_dir / "submission_agreement_lgbm_valbest_ratio.csv"
    # The test cut is taken by ratio: the same positive fraction that the best
    # validation threshold produces on the OOF scores.
    pred_test = write_submission(agree_path, agree_test_score, known, ratio=float((agree_oof >= th).mean()))
    pred_val = (agree_oof >= th).astype(np.int8)
    overall_rows.append(
        {
            "experiment": "agreement_features_lgbm",
            "validation_f1": f,
            "precision": p,
            "recall": r,
            "predicted_positive_ratio": float(pred_test.mean()),
            "tp": int(((pred_val == 1) & (y == 1)).sum()),
            "fp": int(((pred_val == 1) & (y == 0)).sum()),
            "fn": int(((pred_val == 0) & (y == 1)).sum()),
            "public_submission_path": str(agree_path),
            "changed_predictions_vs_current_best": int((pred_test != anchor_pred).sum()),
        }
    )
    pd.Series(ctx["feature_names"] + agree_names).to_csv(out_dir / "agreement_feature_names.csv", index=False)

    # --- feature importance from one model fitted on all validation rows ---
    feature_rows = []
    full_model = fit_full_lgb(ctx["X"], y, args.seed + 111, params={"n_estimators": 900})
    importances = pd.DataFrame({"feature": ctx["feature_names"], "gain_importance": full_model.booster_.feature_importance(importance_type="gain"), "split_importance": full_model.booster_.feature_importance(importance_type="split")})
    importances.sort_values("gain_importance", ascending=False).to_csv(out_dir / "feature_importance_gain.csv", index=False)

    # Permutation importance on a stratified sample of at most 8000 rows.
    _, sample_idx = train_test_split(np.arange(len(y)), test_size=min(8000, len(y) // 10), stratify=y, random_state=args.seed)
    perm = permutation_importance(full_model, ctx["X"][sample_idx], y[sample_idx], scoring="f1", n_repeats=1, random_state=args.seed, n_jobs=1)
    pd.DataFrame({"feature": ctx["feature_names"], "permutation_importance_mean": perm.importances_mean, "permutation_importance_std": perm.importances_std}).sort_values("permutation_importance_mean", ascending=False).to_csv(out_dir / "permutation_importance.csv", index=False)

    # --- feature pruning: retrain the stacker on three reduced column sets ---
    # The lowest-gain 15% of features, but never fewer than five.
    low_gain = set(importances.sort_values("gain_importance").head(max(5, int(0.15 * len(ctx["feature_names"]))))["feature"])
    rw_features = [n for n in ctx["feature_names"] if n.startswith("deepwalk_") or n.startswith("node2vec_")]
    rw_drop = set([n for n in rw_features if ("_z" in n or "_author_rank" in n)])
    prune_sets = {
        "drop_low_gain_15pct": [i for i, n in enumerate(ctx["feature_names"]) if n not in low_gain],
        "drop_rw_z_author_rank": [i for i, n in enumerate(ctx["feature_names"]) if n not in rw_drop],
        "keep_raw_and_rank_no_z": [i for i, n in enumerate(ctx["feature_names"]) if not n.endswith("_z")],
    }
    for name, cols in prune_sets.items():
        Xp = ctx["X"][:, cols]
        Xtp = ctx["X_test"][:, cols]
        oof, models = fit_lgb_oof(Xp, y, args.seed + 131, args.n_splits)
        f, th, auc, p, r = best_f1(y, oof)
        test_pred_score = average_predict(models, Xtp)
        path = sub_dir / f"submission_prune_{name}.csv"
        pred_test = write_submission(path, test_pred_score, known, ratio=float((oof >= th).mean()))
        pred_val = (oof >= th).astype(np.int8)
        feature_rows.append({"stage": name, "validation_f1": f, "threshold": th, "auc": auc, "precision": p, "recall": r, "n_features": len(cols), "public_submission_path": str(path), "test_predicted_positive_ratio": float(pred_test.mean()), "changed_predictions_vs_current_best": int((pred_test != anchor_pred).sum())})
        overall_rows.append(
            {
                "experiment": f"feature_pruning_{name}",
                "validation_f1": f,
                "precision": p,
                "recall": r,
                "predicted_positive_ratio": float(pred_test.mean()),
                "tp": int(((pred_val == 1) & (y == 1)).sum()),
                "fp": int(((pred_val == 1) & (y == 0)).sum()),
                "fn": int(((pred_val == 0) & (y == 1)).sum()),
                "public_submission_path": str(path),
                "changed_predictions_vs_current_best": int((pred_test != anchor_pred).sum()),
            }
        )
    pd.DataFrame(feature_rows).to_csv(out_dir / "feature_pruning.csv", index=False)

    # --- combined summary, best validation F1 first ---
    pd.DataFrame(overall_rows).sort_values("validation_f1", ascending=False).to_csv(out_dir / "experiment_summary.csv", index=False)
    # The handle rebinds ``f`` (previously an F1 value); nothing reads the old value afterwards.
    with (out_dir / "run_notes.txt").open("w") as f:
        f.write(f"current_anchor_public_submission={anchor_path}\n")
        f.write(f"current_anchor_validation_at_th0.480={best_anchor}\n")
        f.write("All model-selection metrics are validation OOF; submission paths are candidates only.\n")
    print(pd.DataFrame(overall_rows).sort_values("validation_f1", ascending=False).to_string(index=False))
|
|
|
|
# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|