File size: 14,119 Bytes

f28d994

"""Post-0.95 incremental ablations for the hybrid stacker."""

from __future__ import annotations

import argparse
import importlib.util
import pickle as pkl
from pathlib import Path

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import precision_recall_curve, roc_auc_score
from sklearn.model_selection import StratifiedKFold


def load_module(name: str, path: Path):
    """Import the Python source file at ``path`` as a module named ``name``.

    The module body is executed and the module object is returned. The module
    is not registered in ``sys.modules``.

    Args:
        name: Module name assigned to the loaded module.
        path: Location of the source file to execute.

    Returns:
        The freshly executed module object.

    Raises:
        ImportError: If no import spec or loader can be built for ``path``
            (for example, the file suffix is not a recognised source suffix).
    """
    spec = importlib.util.spec_from_file_location(name, path)
    # spec_from_file_location returns None when it cannot pick a loader for
    # the path.  Check before module_from_spec, which would otherwise fail
    # with an opaque AttributeError; an explicit raise (unlike assert) also
    # survives ``python -O``.
    if spec is None or spec.loader is None:
        raise ImportError(f"cannot load module {name!r} from {path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


def best_f1(y: np.ndarray, s: np.ndarray):
    """Pick the point on the precision-recall curve of ``(y, s)`` with the highest F1.

    Args:
        y: Binary ground-truth labels.
        s: Scores, one per label; higher means more likely positive.

    Returns:
        Tuple ``(f1, threshold, auc, precision, recall)`` of floats.
        ``threshold`` is the curve threshold at the best-F1 point, or ``0.5``
        when that point has no associated threshold. ``auc`` is the ROC AUC
        of ``s`` against ``y``, or NaN when ``y`` contains fewer than two
        distinct classes.
    """
    p, r, t = precision_recall_curve(y, s)
    # The small epsilon keeps the ratio finite where precision and recall
    # are both zero.
    f = 2 * p * r / (p + r + 1e-12)
    i = int(np.argmax(f))
    # The curve has one more (precision, recall) point than thresholds, so
    # the last index has no threshold of its own.
    th = float(t[i]) if i < len(t) else 0.5
    # roc_auc_score raises ValueError when only one class is present, which
    # can happen when this is called on a small per-group subset.  AUC is
    # undefined there, so report NaN instead of aborting the whole run.
    if np.unique(y).size > 1:
        auc = float(roc_auc_score(y, s))
    else:
        auc = float("nan")
    return float(f[i]), th, auc, float(p[i]), float(r[i])


def prf(y: np.ndarray, pred: np.ndarray):
    """Compute precision, recall and F1 of hard predictions against labels.

    Args:
        y: Ground-truth labels; entries equal to 1 are positives and entries
            equal to 0 are negatives.
        pred: Predicted labels using the same 0/1 encoding.

    Returns:
        Tuple ``(precision, recall, f1, tp, fp, fn)``. The ratios carry a
        tiny epsilon in the denominator, so they are 0.0 rather than
        undefined when a denominator count is zero.
    """
    predicted_pos = pred == 1
    actual_pos = y == 1

    tp = int(np.count_nonzero(predicted_pos & actual_pos))
    fp = int(np.count_nonzero(predicted_pos & (y == 0)))
    fn = int(np.count_nonzero((pred == 0) & actual_pos))

    precision = tp / (tp + fp + 1e-12)
    recall = tp / (tp + fn + 1e-12)
    f1 = 2 * precision * recall / (precision + recall + 1e-12)
    return precision, recall, f1, tp, fp, fn


def rank01(x: np.ndarray) -> np.ndarray:
    """Map each entry of ``x`` to its rank, rescaled linearly onto [0, 1].

    The smallest value gets 0.0 and the largest gets 1.0. Ties are not
    averaged: a stable sort is used, so equal values receive increasing
    ranks in order of their position in ``x``.

    Returns:
        A float32 array of the same length as ``x``.
    """
    n = len(x)
    sorted_positions = np.argsort(x, kind="mergesort")
    evenly_spaced = np.linspace(0.0, 1.0, n, dtype=np.float32)
    ranks = np.empty(n, dtype=np.float32)
    # Scatter: the element that sorts k-th receives the k-th grid value.
    ranks[sorted_positions] = evenly_spaced
    return ranks


def zscore(x: np.ndarray) -> np.ndarray:
    """Standardise ``x`` to zero mean and unit (population) standard deviation.

    A small constant is added to the standard deviation so that a constant
    input yields zeros instead of a division by zero.

    Returns:
        A float32 array with the same shape as ``x``.
    """
    centered = x - x.mean()
    scale = x.std() + 1e-8
    return (centered / scale).astype(np.float32)


def fit_lgb_oof(X: np.ndarray, y: np.ndarray, seed: int, n_splits: int) -> np.ndarray:
    """Produce out-of-fold LightGBM probabilities via stratified K-fold.

    For every fold a fresh ``LGBMClassifier`` is trained on the training part
    and used to score the held-out part, so each row's prediction comes from
    a model that did not see that row.

    Args:
        X: Feature matrix, indexed by row.
        y: Binary labels aligned with the rows of ``X``.
        seed: Seed for the fold shuffling; fold ``k`` (1-based) trains its
            model with ``random_state=seed + k``.
        n_splits: Number of stratified folds.

    Returns:
        A float32 array of positive-class probabilities, one per row.
    """
    model_params = dict(
        n_estimators=1200,
        learning_rate=0.025,
        num_leaves=31,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=5.0,
        min_child_samples=80,
        objective="binary",
        verbose=-1,
    )
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    predictions = np.zeros(len(y), dtype=np.float32)
    for fold_no, (train_idx, valid_idx) in enumerate(splitter.split(X, y), start=1):
        model = lgb.LGBMClassifier(random_state=seed + fold_no, **model_params)
        model.fit(X[train_idx], y[train_idx])
        # Column 1 of predict_proba is the probability of the positive class.
        predictions[valid_idx] = model.predict_proba(X[valid_idx])[:, 1]
    return predictions


def bucket_series(values: np.ndarray, name: str, bins: list[float]) -> pd.Categorical:
    """Cut ``values`` into left-closed intervals labelled ``name[lo,hi)``.

    Args:
        values: Numeric values to bucket.
        name: Prefix used in every bucket label.
        bins: Increasing bin edges; infinite edges are rendered as
            ``-inf`` / ``inf`` in the labels, finite ones with ``:g``.

    Returns:
        The result of ``pd.cut`` with one label per consecutive edge pair.
    """

    def _edge_text(edge: float) -> str:
        # Render one bin edge for use inside a label.
        if np.isneginf(edge):
            return "-inf"
        if np.isposinf(edge):
            return "inf"
        return f"{edge:g}"

    labels = [
        f"{name}[{_edge_text(lower)},{_edge_text(upper)})"
        for lower, upper in zip(bins, bins[1:])
    ]
    return pd.cut(values, bins=bins, labels=labels, include_lowest=True, right=False)


def error_analysis(
    y: np.ndarray,
    score: np.ndarray,
    pred: np.ndarray,
    pairs: np.ndarray,
    X_hand: np.ndarray,
    score_lgcn: np.ndarray,
    author_internal_rank: np.ndarray,
    out_dir: Path,
):
    """Break precision/recall/F1 down by several bucketings and report them.

    Rows are bucketed by author degree, paper degree, LightGCN score decile,
    author-internal rank and per-author candidate count. One summary row per
    non-empty bucket is written to ``error_analysis_buckets.csv`` inside
    ``out_dir`` and the table is also printed.

    Args:
        y: 0/1 ground-truth labels.
        score: Model scores; accepted for interface symmetry but not used
            by this function.
        pred: 0/1 hard predictions aligned with ``y``.
        pairs: 2-D array whose column 0 is used as the author id of each row.
        X_hand: Handcrafted features; column 0 is read as author degree and
            column 1 as paper degree.
        score_lgcn: LightGCN scores, bucketed into (up to) ten quantiles.
        author_internal_rank: Per-row rank value bucketed on fixed edges.
        out_dir: Existing directory that receives the CSV file.
    """
    author_degree = X_hand[:, 0]
    paper_degree = X_hand[:, 1]

    # How many candidate rows each author has, broadcast back onto the rows.
    author_ids = pd.Series(pairs[:, 0])
    rows_per_author = pd.Series(np.arange(len(pairs)), index=pairs[:, 0]).groupby(level=0).count()
    candidate_count = author_ids.map(rows_per_author).to_numpy()

    buckets = {
        "author_degree": bucket_series(author_degree, "author_degree", [-np.inf, 1, 3, 8, 20, 50, np.inf]),
        "paper_degree": bucket_series(paper_degree, "paper_degree", [-np.inf, 1, 3, 10, 30, 100, np.inf]),
        "score_lgcn": pd.qcut(score_lgcn, q=10, duplicates="drop"),
        "author_internal_rank": bucket_series(author_internal_rank, "author_internal_rank", [-np.inf, 1, 3, 5, 10, 20, 50, np.inf]),
        "author_candidate_count": bucket_series(candidate_count.astype(np.float32), "author_candidate_count", [-np.inf, 5, 10, 20, 50, 100, np.inf]),
    }

    records = []
    for bucket_type, categories in buckets.items():
        observed = pd.Series(categories).dropna().unique()
        for category in observed:
            selected = np.asarray(categories == category)
            n_rows = int(selected.sum())
            if n_rows == 0:
                continue
            y_sel = y[selected]
            pred_sel = pred[selected]
            precision, recall, f1, tp, fp, fn = prf(y_sel, pred_sel)
            record = {
                "bucket_type": bucket_type,
                "bucket": str(category),
                "n": n_rows,
                "positives": int(y_sel.sum()),
                "pred_pos": int(pred_sel.sum()),
                "fp": fp,
                "fn": fn,
                "precision": precision,
                "recall": recall,
                "f1": f1,
            }
            records.append(record)

    report = pd.DataFrame(records)
    report.to_csv(out_dir / "error_analysis_buckets.csv", index=False)
    print("\nError analysis buckets:")
    print(report.to_string(index=False, max_rows=80))


def group_threshold(y: np.ndarray, score: np.ndarray, groups: np.ndarray):
    """Tune one best-F1 decision threshold per group and score the result.

    For every distinct non-null value in ``groups`` the threshold that
    maximises F1 on that group's rows is selected and applied to those same
    rows. Rows whose group is null keep a prediction of 0.

    NOTE(review): thresholds are chosen and evaluated on the same rows, so
    the returned metrics are presumably optimistic - confirm how callers
    interpret them.

    Args:
        y: 0/1 ground-truth labels.
        score: Scores aligned with ``y``.
        groups: Group label of each row.

    Returns:
        Tuple ``(f1, precision, recall, thresholds, pred)`` where
        ``thresholds`` maps ``str(group)`` to its threshold and ``pred`` is
        the int8 array of resulting hard predictions.
    """
    decisions = np.zeros(len(y), dtype=np.int8)
    per_group_cutoff = {}
    distinct_groups = pd.Series(groups).dropna().unique()
    for label in distinct_groups:
        members = np.asarray(groups == label)
        if not members.any():
            continue
        cutoff = best_f1(y[members], score[members])[1]
        decisions[members] = (score[members] >= cutoff).astype(np.int8)
        per_group_cutoff[str(label)] = float(cutoff)
    precision, recall, f1 = prf(y, decisions)[:3]
    return f1, precision, recall, per_group_cutoff, decisions


def author_quota_tuning(y: np.ndarray, score: np.ndarray, pairs: np.ndarray, author_degree: np.ndarray):
    """Sweep a per-author acceptance quota and return the best-F1 setting.

    For each base ratio in ``np.linspace(0.46, 0.54, 17)`` every author
    accepts the top ``round(n_candidates * ratio)`` of its candidate rows by
    score, where ``ratio`` is the base ratio plus an adjustment that depends
    on the author's degree bucket, clipped to ``[0.05, 0.80]``.

    Args:
        y: 0/1 ground-truth labels.
        score: Scores aligned with ``y``; higher ranks first.
        pairs: 2-D array whose column 0 is used as the author id of each row.
        author_degree: Per-row author degree used to choose the adjustment;
            the bucket of an author's first row is used for the whole author.

    Returns:
        Dict with keys ``base_ratio``, ``f1``, ``precision``, ``recall`` and
        ``pred_ratio`` for the base ratio with the highest F1 (the earliest
        one on ties).
    """
    buckets = bucket_series(author_degree, "author_degree", [-np.inf, 1, 3, 8, 20, 50, np.inf])
    # Slightly more permissive for active authors.
    bucket_adj = {
        "author_degree[-inf,1)": -0.04,
        "author_degree[1,3)": -0.02,
        "author_degree[3,8)": 0.00,
        "author_degree[8,20)": 0.01,
        "author_degree[20,50)": 0.02,
        "author_degree[50,inf)": 0.03,
    }
    df = pd.DataFrame({"idx": np.arange(len(y)), "author": pairs[:, 0], "score": score, "bucket": buckets})

    # Grouping, score ordering and bucket adjustment do not depend on the
    # base ratio, so compute them once instead of once per swept ratio.
    per_author = []
    for _, g in df.groupby("author", sort=False):
        idx = g["idx"].to_numpy()
        # Row indices of this author ordered by ascending score.
        ranked = idx[np.argsort(g["score"].to_numpy())]
        adj = bucket_adj.get(str(g["bucket"].iloc[0]), 0.0)
        per_author.append((ranked, adj))

    best = None
    for base in np.linspace(0.46, 0.54, 17):
        pred = np.zeros(len(y), dtype=np.int8)
        for ranked, adj in per_author:
            ratio = min(0.80, max(0.05, base + adj))
            k = int(round(len(ranked) * ratio))
            if k <= 0:
                continue
            # The last k entries are the k highest-scoring rows.
            pred[ranked[-k:]] = 1
        precision, recall, f1, *_ = prf(y, pred)
        row = {"base_ratio": float(base), "f1": f1, "precision": precision, "recall": recall, "pred_ratio": float(pred.mean())}
        if best is None or f1 > best["f1"]:
            best = row
    return best


def negative_evidence_features(X_hand: np.ndarray, score_lgcn: np.ndarray) -> np.ndarray:
    """Build eight extra features from handcrafted columns and LightGCN scores.

    The features are, in column order: an indicator that the summed overlap
    columns (3, 7, 8, 12, 13, 14 of ``X_hand``) are positive; the LightGCN
    score gated by that indicator and by its complement; the score divided
    by ``log1p(paper_degree + 1)``; the 0-1 rank of paper degree; and paper
    degree multiplied by columns 7, 8 and 13 of ``X_hand``.

    Args:
        X_hand: Handcrafted feature matrix with at least 15 columns; column 1
            is read as paper degree.
        score_lgcn: LightGCN score per row.

    Returns:
        A float32 array with one row per input row and eight columns.
    """
    paper_degree = X_hand[:, 1]

    # Accumulate the overlap columns left to right.
    local_overlap = X_hand[:, 3]
    for col in (7, 8, 12, 13, 14):
        local_overlap = local_overlap + X_hand[:, col]
    has_any = (local_overlap > 0).astype(np.float32)

    score_with_overlap = score_lgcn * has_any
    score_without_overlap = score_lgcn * (1.0 - has_any)
    score_per_log_degree = score_lgcn / np.log1p(paper_degree + 1.0)
    paper_pct = rank01(paper_degree)
    degree_interactions = [paper_degree * X_hand[:, col] for col in (7, 8, 13)]

    columns = [
        has_any,
        score_with_overlap,
        score_without_overlap,
        score_per_log_degree,
        paper_pct,
        *degree_interactions,
    ]
    return np.column_stack(columns).astype(np.float32)


def topk_content_similarity(root: Path, pairs: np.ndarray, builder) -> np.ndarray:
    """Compute (and cache) top-k cosine similarities between a paper and an author's papers.

    For each ``(author, paper)`` row the candidate paper's feature vector is
    compared with the feature vectors of ``builder.author_papers[author]``.
    The three output columns are the maximum similarity, the mean of the top
    3 and the mean of the top 5 (fewer if the author has fewer papers). Rows
    whose author has no papers stay at zero.

    Results are cached as ``.npy`` under ``validation_runs/feature_cache``.
    NOTE(review): the cache key is derived only from the row count and the
    column sums of ``pairs``, so two different pair sets could in principle
    share a key - confirm this is acceptable.

    Args:
        root: Package root containing ``data_and_docs/feature.pkl``.
        pairs: 2-D integer array of ``(author, paper)`` ids.
        builder: Object exposing ``author_papers``, indexable by author id.

    Returns:
        A float32 array with one row per pair and three columns.
    """
    cache_dir = root / "validation_runs" / "feature_cache"
    cache_dir.mkdir(parents=True, exist_ok=True)

    n_pairs = len(pairs)
    author_sum = int(pairs[:, 0].sum())
    paper_sum = int(pairs[:, 1].sum())
    cache_file = cache_dir / f"topk_content_{n_pairs}_{author_sum}_{paper_sum}.npy"
    if cache_file.exists():
        return np.load(cache_file)

    # NOTE: pickle executes arbitrary code on load; this file must come from
    # a trusted source.  The unpickled object is expected to offer .numpy().
    with (root / "data_and_docs" / "feature.pkl").open("rb") as handle:
        feat = pkl.load(handle).numpy().astype(np.float32)
    # L2-normalise rows so that dot products below are cosine similarities.
    feat /= np.linalg.norm(feat, axis=1, keepdims=True) + 1e-8

    out = np.zeros((n_pairs, 3), dtype=np.float32)
    for row, (a_raw, p_raw) in enumerate(pairs):
        papers = list(builder.author_papers[int(a_raw)])
        if not papers:
            continue
        sims = feat[np.asarray(papers, dtype=np.int64)] @ feat[int(p_raw)]
        descending = np.sort(sims)[::-1]
        out[row, 0] = descending[0]
        out[row, 1] = descending[:3].mean()
        out[row, 2] = descending[:5].mean()

    np.save(cache_file, out)
    return out


def load_lgcn_variant_scores(root: Path, split_seed: int, y: np.ndarray, max_cols: int = 20):
    """Load validation scores of LightGCN variants and turn them into features.

    Score files matching ``dyn*/scores/val_*.npy`` under
    ``validation_runs/dynamic_seed{split_seed}`` are considered. Files whose
    path below that directory contains ``hgt``, ``sage``, ``bce``, ``norm``
    or ``hinge`` are skipped, as are files with the wrong length or
    (near-)constant scores. The remaining variants are ranked by their best
    F1 against ``y`` and the top ``max_cols`` are kept.

    NOTE(review): variants are selected using the same labels ``y`` that the
    downstream stacker is evaluated on - confirm this selection bias is
    acceptable for the ablation.

    Args:
        root: Package root directory.
        split_seed: Seed identifying the ``dynamic_seed*`` run directory.
        y: Binary validation labels; score files must have the same length.
        max_cols: Maximum number of variants to keep.

    Returns:
        Tuple ``(features, names)``. For every kept variant there is a
        z-scored and a 0-1 rank column, followed by three aggregate columns
        (z-scored mean, z-scored std and rank of the mean across variants).
        When no variant qualifies, an array with zero columns and an empty
        list are returned.
    """
    seed_dir = root / "validation_runs" / f"dynamic_seed{split_seed}"
    excluded_tokens = ("hgt", "sage", "bce", "norm", "hinge")
    rows = []
    for p in sorted(seed_dir.glob("dyn*/scores/val_*.npy")):
        # Match the tokens only against the run-specific part of the path.
        # Matching the absolute path would silently drop every file whenever
        # a parent directory name happens to contain a token (for example
        # "message_passing" contains "sage").
        relative_name = p.relative_to(seed_dir).as_posix()
        if any(token in relative_name for token in excluded_tokens):
            continue
        x = np.load(p).astype(np.float32)
        if len(x) != len(y) or np.std(x) < 1e-8:
            continue
        f1, _, auc, _, _ = best_f1(y, x)
        rows.append((f1, auc, str(p), x))
    rows.sort(key=lambda r: r[0], reverse=True)
    chosen = rows[:max_cols]
    if not chosen:
        return np.zeros((len(y), 0), dtype=np.float32), []
    cols = []
    names = []
    raw_stack = []
    for _, _, name, x in chosen:
        raw_stack.append(x)
        cols.extend([zscore(x), rank01(x)])
        names.extend([name + "::z", name + "::rank"])
    # One row per variant; the aggregates below are taken across variants.
    raw = np.vstack(raw_stack)
    cols.extend([zscore(raw.mean(axis=0)), zscore(raw.std(axis=0)), rank01(raw.mean(axis=0))])
    names.extend(["lgcn_variant_mean_z", "lgcn_variant_std_z", "lgcn_variant_mean_rank"])
    return np.column_stack(cols).astype(np.float32), names


def main():
    """Run the incremental ablation pipeline and write its artefacts.

    Stages, in order: baseline stacking, per-group threshold tuning and
    per-author quota tuning on the baseline out-of-fold scores, then three
    cumulative feature additions (negative-evidence features, top-k content
    similarity, multi-variant LightGCN scores). Each stage appends one row to
    the ablation table, which is written together with the out-of-fold score
    arrays to ``validation_runs/dynamic_seed{split_seed}/post95_ablation``
    under the package root.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--package-root", type=Path, default=Path(__file__).resolve().parents[1])
    parser.add_argument("--split-seed", type=int, required=True)
    parser.add_argument("--lgcn-score-file", type=Path, required=True)
    parser.add_argument("--n-splits", type=int, default=5)
    parser.add_argument("--seed", type=int, default=0)
    args = parser.parse_args()

    root = args.package_root
    # Sibling project scripts are loaded by file path rather than imported
    # as a package.
    stack_mod = load_module("stack_rank_calibration", root / "code" / "stack_rank_calibration.py")
    lgcn_mod = load_module("train_val_lgcn_ensemble", root / "code" / "train_val_lgcn_ensemble.py")
    # NOTE(review): 0.9 is presumably the train fraction of the split -
    # confirm against make_notebook_style_split.
    train_refs, val_pairs = lgcn_mod.make_notebook_style_split(root, args.split_seed, 0.9)
    pairs = val_pairs[["source", "target"]].to_numpy(np.int64)
    y = val_pairs["label"].to_numpy(np.int8)
    # Assumed to be aligned row-for-row with val_pairs; no length check is
    # performed here.
    score_lgcn = np.load(args.lgcn_score_file).astype(np.float32)
    builder = stack_mod.ExplicitGraphFeatures(root, train_refs)
    out_dir = root / "validation_runs" / f"dynamic_seed{args.split_seed}" / "post95_ablation"
    out_dir.mkdir(parents=True, exist_ok=True)

    print("building baseline handcrafted/rank features")
    X_hand = builder.transform(pairs)
    X_rank = stack_mod.add_rank_features(pairs, score_lgcn)
    X_base = np.column_stack([X_rank, X_hand]).astype(np.float32)

    # One dict per stage; becomes the ablation table at the end.
    rows = []
    base_oof = fit_lgb_oof(X_base, y, args.seed, args.n_splits)
    f1, th, auc, precision, recall = best_f1(y, base_oof)
    rows.append({"stage": "baseline_stacking", "f1": f1, "threshold": th, "auc": auc, "precision": precision, "recall": recall, "n_features": X_base.shape[1]})
    base_pred = (base_oof >= th).astype(np.int8)
    # NOTE(review): column 3 of X_rank is passed as the author-internal rank
    # - verify the column layout of add_rank_features.
    error_analysis(y, base_oof, base_pred, pairs, X_hand, score_lgcn, X_rank[:, 3], out_dir)

    # Group threshold tuning on baseline OOF scores.
    author_bucket = bucket_series(X_hand[:, 0], "author_degree", [-np.inf, 1, 3, 8, 20, 50, np.inf])
    score_bucket = pd.qcut(score_lgcn, q=10, duplicates="drop")
    for name, group in [("group_threshold_author_degree", author_bucket), ("group_threshold_score_lgcn", score_bucket)]:
        gf1, gp, gr, thresholds, _ = group_threshold(y, base_oof, np.asarray(group))
        # The baseline AUC is reused: the scores are unchanged, only the
        # thresholds differ, and AUC does not depend on the threshold.
        rows.append({"stage": name, "f1": gf1, "threshold": np.nan, "auc": auc, "precision": gp, "recall": gr, "n_features": X_base.shape[1]})
        pd.Series(thresholds).to_csv(out_dir / f"{name}_thresholds.csv")
    quota = author_quota_tuning(y, base_oof, pairs, X_hand[:, 0])
    # For this stage the "threshold" column holds the selected base ratio.
    rows.append({"stage": "author_quota_by_degree", "f1": quota["f1"], "threshold": quota["base_ratio"], "auc": np.nan, "precision": quota["precision"], "recall": quota["recall"], "n_features": X_base.shape[1]})

    # Each later stage stacks its features on top of the previous stage's
    # matrix and uses a different seed offset (+11, +22, +33).
    print("adding negative-evidence features")
    X_neg = np.column_stack([X_base, negative_evidence_features(X_hand, score_lgcn)]).astype(np.float32)
    neg_oof = fit_lgb_oof(X_neg, y, args.seed + 11, args.n_splits)
    f1, th, auc, precision, recall = best_f1(y, neg_oof)
    rows.append({"stage": "negative_evidence_features", "f1": f1, "threshold": th, "auc": auc, "precision": precision, "recall": recall, "n_features": X_neg.shape[1]})

    print("adding top-k content similarity features")
    X_sim = np.column_stack([X_neg, topk_content_similarity(root, pairs, builder)]).astype(np.float32)
    sim_oof = fit_lgb_oof(X_sim, y, args.seed + 22, args.n_splits)
    f1, th, auc, precision, recall = best_f1(y, sim_oof)
    rows.append({"stage": "topk_similarity_features", "f1": f1, "threshold": th, "auc": auc, "precision": precision, "recall": recall, "n_features": X_sim.shape[1]})

    print("adding multi-LightGCN variant score features")
    X_var, names = load_lgcn_variant_scores(root, args.split_seed, y)
    (out_dir / "lgcn_variant_feature_names.txt").write_text("\n".join(names) + "\n")
    # X_var may have zero columns when no variant score file qualifies.
    X_ens = np.column_stack([X_sim, X_var]).astype(np.float32)
    ens_oof = fit_lgb_oof(X_ens, y, args.seed + 33, args.n_splits)
    f1, th, auc, precision, recall = best_f1(y, ens_oof)
    rows.append({"stage": "ensemble_lgcn_score_features", "f1": f1, "threshold": th, "auc": auc, "precision": precision, "recall": recall, "n_features": X_ens.shape[1]})

    # Best stage first; persist the table and every stage's OOF scores.
    result = pd.DataFrame(rows).sort_values("f1", ascending=False)
    result.to_csv(out_dir / "ablation_table.csv", index=False)
    np.save(out_dir / "baseline_oof.npy", base_oof)
    np.save(out_dir / "negative_oof.npy", neg_oof)
    np.save(out_dir / "similarity_oof.npy", sim_oof)
    np.save(out_dir / "ensemble_lgcn_oof.npy", ens_oof)
    print("\nAblation table:")
    print(result.to_string(index=False))


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()