"""Post-0.95 incremental ablations for the hybrid stacker."""

from __future__ import annotations

import argparse
import importlib.util
import pickle as pkl
from pathlib import Path

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import precision_recall_curve, roc_auc_score
from sklearn.model_selection import StratifiedKFold


def load_module(name: str, path: Path):
    """Import the Python source file at ``path`` as a module named ``name``.

    The module is executed and returned; it is not registered in
    ``sys.modules``.

    Raises:
        ImportError: if no import spec (or no loader) can be built for
            ``path``, e.g. because its suffix is not a recognised source
            suffix.
    """
    spec = importlib.util.spec_from_file_location(name, path)
    # spec_from_file_location returns None when it cannot choose a loader.
    # Check before module_from_spec touches the spec, and raise instead of
    # asserting so the check is not stripped under ``python -O``.
    if spec is None or spec.loader is None:
        raise ImportError(f"cannot build an import spec for module {name!r} from {path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


def best_f1(y: np.ndarray, s: np.ndarray):
    """Find the score threshold that maximises F1 on the precision-recall curve.

    Args:
        y: binary labels.
        s: scores, one per label.

    Returns:
        ``(f1, threshold, roc_auc, precision, recall)`` as Python floats, all
        taken at the curve point with the highest F1.
    """
    precision, recall, thresholds = precision_recall_curve(y, s)
    # The epsilon keeps 0/0 from producing NaN where precision == recall == 0.
    f_scores = 2 * precision * recall / (precision + recall + 1e-12)
    best = int(np.argmax(f_scores))
    # Fall back to 0.5 if the best point has no matching threshold entry.
    if best < len(thresholds):
        threshold = float(thresholds[best])
    else:
        threshold = 0.5
    auc = float(roc_auc_score(y, s))
    return float(f_scores[best]), threshold, auc, float(precision[best]), float(recall[best])


def prf(y: np.ndarray, pred: np.ndarray):
    """Compute precision, recall and F1 for hard 0/1 predictions.

    Args:
        y: true labels (1 = positive, 0 = negative).
        pred: predicted labels, same length as ``y``.

    Returns:
        ``(precision, recall, f1, tp, fp, fn)``. The ratios carry a 1e-12
        term in the denominator, so empty denominators yield 0.0 rather than
        a division error.
    """
    predicted_pos = pred == 1
    actual_pos = y == 1
    tp = int((predicted_pos & actual_pos).sum())
    fp = int((predicted_pos & (y == 0)).sum())
    fn = int(((pred == 0) & actual_pos).sum())

    precision = tp / (tp + fp + 1e-12)
    recall = tp / (tp + fn + 1e-12)
    f1 = 2 * precision * recall / (precision + recall + 1e-12)
    return precision, recall, f1, tp, fp, fn


def rank01(x: np.ndarray) -> np.ndarray:
    """Map values to evenly spaced ranks in [0, 1] (float32).

    The smallest value gets 0.0 and the largest 1.0. Ties are not averaged:
    the stable sort gives equal values distinct, consecutive ranks in their
    original order.
    """
    n = len(x)
    ranks = np.empty(n, dtype=np.float32)
    # Scatter the evenly spaced grid back to each element's original position.
    ranks[np.argsort(x, kind="mergesort")] = np.linspace(0.0, 1.0, n, dtype=np.float32)
    return ranks


def zscore(x: np.ndarray) -> np.ndarray:
    """Standardise ``x`` to zero mean and unit standard deviation (float32).

    A 1e-8 term is added to the standard deviation, so a constant input
    yields zeros instead of dividing by zero.
    """
    centered = x - x.mean()
    scale = x.std() + 1e-8
    return (centered / scale).astype(np.float32)


def fit_lgb_oof(X: np.ndarray, y: np.ndarray, seed: int, n_splits: int) -> np.ndarray:
    """Return out-of-fold positive-class probabilities from a LightGBM classifier.

    Args:
        X: feature matrix, one row per sample.
        y: binary labels, used both for stratification and as the target.
        seed: seed for the fold shuffle; fold ``k`` (1-based) trains with
            ``random_state=seed + k``.
        n_splits: number of stratified folds.

    Returns:
        float32 array of length ``len(y)``; every entry is predicted by the
        model of the fold that held that row out.
    """
    oof = np.zeros(len(y), dtype=np.float32)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    for fold, (tr, va) in enumerate(skf.split(X, y), start=1):
        clf = lgb.LGBMClassifier(
            n_estimators=1200,
            learning_rate=0.025,
            num_leaves=31,
            subsample=0.9,
            # LightGBM ignores `subsample` unless bagging is switched on with
            # a positive `subsample_freq` (its default, 0, disables bagging).
            # Without this line the 0.9 row-subsampling above never happens.
            subsample_freq=1,
            colsample_bytree=0.9,
            reg_lambda=5.0,
            min_child_samples=80,
            objective="binary",
            verbose=-1,
            random_state=seed + fold,
        )
        clf.fit(X[tr], y[tr])
        oof[va] = clf.predict_proba(X[va])[:, 1]
    return oof


def bucket_series(values: np.ndarray, name: str, bins: list[float]) -> pd.Categorical:
    """Cut ``values`` into left-closed bins labelled ``"<name>[lo,hi)"``.

    Args:
        values: numeric values to bucket.
        name: prefix used in every bucket label.
        bins: ascending bin edges; infinite edges are rendered as
            ``-inf`` / ``inf`` in the labels.

    Returns:
        The categorical produced by ``pd.cut`` with one category per
        consecutive pair of edges.
    """
    lefts = ["-inf" if np.isneginf(lo) else f"{lo:g}" for lo in bins[:-1]]
    rights = ["inf" if np.isposinf(hi) else f"{hi:g}" for hi in bins[1:]]
    labels = [f"{name}[{left},{right})" for left, right in zip(lefts, rights)]
    return pd.cut(values, bins=bins, right=False, include_lowest=True, labels=labels)


def error_analysis(
    y: np.ndarray,
    score: np.ndarray,
    pred: np.ndarray,
    pairs: np.ndarray,
    X_hand: np.ndarray,
    score_lgcn: np.ndarray,
    author_internal_rank: np.ndarray,
    out_dir: Path,
):
    """Break precision/recall/F1 down by bucket, save the table as CSV and print it.

    Rows are bucketed five ways: columns 0 and 1 of ``X_hand`` (treated as
    author and paper degree), deciles of ``score_lgcn``,
    ``author_internal_rank``, and the number of candidate rows sharing the
    same ``pairs[:, 0]`` value.

    Args:
        y: true binary labels.
        score: accepted for call-site symmetry; not read by this function.
        pred: hard 0/1 predictions being analysed.
        pairs: array whose first column identifies the author of each row.
        X_hand: handcrafted feature matrix (columns 0 and 1 are used).
        score_lgcn: raw LightGCN score per row.
        author_internal_rank: per-row rank feature to bucket on.
        out_dir: directory receiving ``error_analysis_buckets.csv``.
    """
    # Number of candidate rows each author has in `pairs`.
    authors = pd.Series(pairs[:, 0])
    candidates_per_author = authors.map(authors.value_counts()).to_numpy()

    buckets = {
        "author_degree": bucket_series(X_hand[:, 0], "author_degree", [-np.inf, 1, 3, 8, 20, 50, np.inf]),
        "paper_degree": bucket_series(X_hand[:, 1], "paper_degree", [-np.inf, 1, 3, 10, 30, 100, np.inf]),
        "score_lgcn": pd.qcut(score_lgcn, q=10, duplicates="drop"),
        "author_internal_rank": bucket_series(author_internal_rank, "author_internal_rank", [-np.inf, 1, 3, 5, 10, 20, 50, np.inf]),
        "author_candidate_count": bucket_series(candidates_per_author.astype(np.float32), "author_candidate_count", [-np.inf, 5, 10, 20, 50, 100, np.inf]),
    }

    records = []
    for bucket_type, cats in buckets.items():
        observed = pd.Series(cats).dropna().unique()
        for cat in observed:
            mask = np.asarray(cats == cat)
            n_rows = int(mask.sum())
            if n_rows == 0:
                continue
            precision, recall, f1, _tp, fp, fn = prf(y[mask], pred[mask])
            records.append(
                {
                    "bucket_type": bucket_type,
                    "bucket": str(cat),
                    "n": n_rows,
                    "positives": int(y[mask].sum()),
                    "pred_pos": int(pred[mask].sum()),
                    "fp": fp,
                    "fn": fn,
                    "precision": precision,
                    "recall": recall,
                    "f1": f1,
                }
            )

    table = pd.DataFrame(records)
    table.to_csv(out_dir / "error_analysis_buckets.csv", index=False)
    print("\nError analysis buckets:")
    print(table.to_string(index=False, max_rows=80))


def group_threshold(y: np.ndarray, score: np.ndarray, groups: np.ndarray):
    """Tune one F1-optimal threshold per group and score the combined predictions.

    Thresholds are fitted on the same labels they are evaluated on, so the
    returned metrics are an in-sample upper bound.

    Args:
        y: true binary labels.
        score: scores to threshold.
        groups: group key per row; rows whose key is missing (NaN) are never
            assigned to a group and keep a prediction of 0.

    Returns:
        ``(f1, precision, recall, thresholds, pred)`` where ``thresholds``
        maps ``str(group)`` to its threshold and ``pred`` is the int8
        prediction vector.
    """
    pred = np.zeros(len(y), dtype=np.int8)
    thresholds = {}
    for g in pd.Series(groups).dropna().unique():
        mask = np.asarray(groups == g)
        if mask.sum() == 0:
            continue
        y_group = y[mask]
        if y_group.min() == y_group.max():
            # Single-class group: ROC AUC is undefined (roc_auc_score raises
            # or returns NaN depending on the scikit-learn version) and the
            # PR curve degenerates so that an all-negative group would be
            # predicted entirely positive. Pick the trivially correct rule.
            th = -np.inf if y_group[0] == 1 else np.inf
        else:
            _, th, _, _, _ = best_f1(y_group, score[mask])
        pred[mask] = (score[mask] >= th).astype(np.int8)
        thresholds[str(g)] = float(th)
    precision, recall, f1, *_ = prf(y, pred)
    return f1, precision, recall, thresholds, pred


def author_quota_tuning(y: np.ndarray, score: np.ndarray, pairs: np.ndarray, author_degree: np.ndarray):
    """Sweep a per-author top-k quota and return the best-F1 setting.

    For every base ratio in ``linspace(0.46, 0.54, 17)`` each author's
    candidates are ranked by ``score`` and the top
    ``round(n_candidates * ratio)`` are predicted positive, where ``ratio``
    is the base ratio plus a degree-bucket adjustment, clipped to
    ``[0.05, 0.80]``.

    Args:
        y: true binary labels.
        score: scores used to rank each author's candidates.
        pairs: array whose first column identifies the author of each row.
        author_degree: per-row author degree; the bucket of an author is
            taken from that author's first row.

    Returns:
        Dict with ``base_ratio``, ``f1``, ``precision``, ``recall`` and
        ``pred_ratio`` for the first base ratio reaching the highest F1.
    """
    buckets = bucket_series(author_degree, "author_degree", [-np.inf, 1, 3, 8, 20, 50, np.inf])
    # Slightly more permissive for active authors.
    bucket_adj = {
        "author_degree[-inf,1)": -0.04,
        "author_degree[1,3)": -0.02,
        "author_degree[3,8)": 0.00,
        "author_degree[8,20)": 0.01,
        "author_degree[20,50)": 0.02,
        "author_degree[50,inf)": 0.03,
    }
    df = pd.DataFrame({"idx": np.arange(len(y)), "author": pairs[:, 0], "score": score, "bucket": buckets})

    # The grouping, score ordering and bucket adjustment do not depend on the
    # base ratio, so compute them once instead of once per sweep step.
    author_groups = []
    for _, g in df.groupby("author", sort=False):
        adj = bucket_adj.get(str(g["bucket"].iloc[0]), 0.0)
        order = np.argsort(g["score"].to_numpy())
        # Row indices of this author's candidates, ascending by score.
        author_groups.append((g["idx"].to_numpy()[order], adj))

    best = None
    for base in np.linspace(0.46, 0.54, 17):
        pred = np.zeros(len(y), dtype=np.int8)
        for ranked_idx, adj in author_groups:
            ratio = min(0.80, max(0.05, base + adj))
            k = int(round(len(ranked_idx) * ratio))
            if k <= 0:
                continue
            pred[ranked_idx[-k:]] = 1
        precision, recall, f1, *_ = prf(y, pred)
        row = {"base_ratio": float(base), "f1": f1, "precision": precision, "recall": recall, "pred_ratio": float(pred.mean())}
        if best is None or f1 > best["f1"]:
            best = row
    return best


def negative_evidence_features(X_hand: np.ndarray, score_lgcn: np.ndarray) -> np.ndarray:
    """Build eight extra features that interact the LightGCN score with overlap evidence.

    Column 1 of ``X_hand`` is used as the paper degree; columns 3, 7, 8, 12,
    13 and 14 are summed into a "local overlap" signal (their individual
    meaning is defined by the handcrafted feature builder — TODO confirm
    against it).

    Returns:
        float32 matrix with columns, in order: overlap indicator, score
        where overlap exists, score where it does not, score damped by
        ``log1p(paper_degree + 1)``, paper-degree rank in [0, 1], and
        paper degree times columns 7, 8 and 13.
    """
    paper_degree = X_hand[:, 1]
    col7, col8, col13 = X_hand[:, 7], X_hand[:, 8], X_hand[:, 13]
    local_overlap = X_hand[:, 3] + col7 + col8 + X_hand[:, 12] + col13 + X_hand[:, 14]
    has_any = (local_overlap > 0).astype(np.float32)

    features = [
        has_any,
        score_lgcn * has_any,
        score_lgcn * (1.0 - has_any),
        score_lgcn / np.log1p(paper_degree + 1.0),
        rank01(paper_degree),
        paper_degree * col7,
        paper_degree * col8,
        paper_degree * col13,
    ]
    return np.column_stack(features).astype(np.float32)


def topk_content_similarity(root: Path, pairs: np.ndarray, builder) -> np.ndarray:
    """Cosine similarity between each candidate paper and the author's closest papers.

    For every ``(author, paper)`` row the candidate paper's feature vector is
    compared with the vectors of ``builder.author_papers[author]``.

    Args:
        root: package root; features are read from
            ``data_and_docs/feature.pkl`` and results cached under
            ``validation_runs/feature_cache``.
        pairs: integer array of ``(author, paper)`` rows.
        builder: object exposing ``author_papers``, indexable by author id
            and yielding that author's paper ids.

    Returns:
        float32 array of shape ``(len(pairs), 3)``: the maximum similarity,
        the mean of the top 3 and the mean of the top 5. Rows for authors
        with no papers stay at zero.
    """
    import hashlib  # function-scope import: only the cache key needs it

    cache = root / "validation_runs" / "feature_cache"
    cache.mkdir(parents=True, exist_ok=True)
    # Key the cache on the exact pair contents and order. A key made of the
    # length and column sums collides for any reordering of the same pairs
    # (and for unrelated pair sets with equal sums), which would silently
    # return features aligned to different rows.
    # NOTE(review): the result also depends on `builder.author_papers`, which
    # is not part of the key — clear the cache if the training graph changes.
    digest = hashlib.sha256(np.ascontiguousarray(pairs, dtype=np.int64).tobytes()).hexdigest()[:16]
    path = cache / f"topk_content_{len(pairs)}_{digest}.npy"
    if path.exists():
        return np.load(path)
    # SECURITY: unpickling executes arbitrary code; feature.pkl must come
    # from a trusted source.
    with (root / "data_and_docs" / "feature.pkl").open("rb") as f:
        feat = pkl.load(f).numpy().astype(np.float32)
    # L2-normalise rows so the dot products below are cosine similarities.
    feat /= np.linalg.norm(feat, axis=1, keepdims=True) + 1e-8
    out = np.zeros((len(pairs), 3), dtype=np.float32)
    for i, (a_raw, p_raw) in enumerate(pairs):
        papers = list(builder.author_papers[int(a_raw)])
        if not papers:
            continue
        sims = feat[np.asarray(papers, dtype=np.int64)] @ feat[int(p_raw)]
        sims.sort()
        vals = sims[::-1]  # descending
        out[i, 0] = vals[0]
        out[i, 1] = vals[: min(3, len(vals))].mean()
        out[i, 2] = vals[: min(5, len(vals))].mean()
    np.save(path, out)
    return out


def load_lgcn_variant_scores(root: Path, split_seed: int, y: np.ndarray, max_cols: int = 20):
    """Load saved validation scores of LightGCN variants and turn them into features.

    Score files are globbed from
    ``validation_runs/dynamic_seed<split_seed>/dyn*/scores/val_*.npy``. Files
    whose run directory or file name contains ``hgt``, ``sage``, ``bce``,
    ``norm`` or ``hinge`` are skipped, as are files of the wrong length or
    with (near-)constant scores. The ``max_cols`` files with the highest
    best-F1 against ``y`` are kept.

    NOTE(review): variants are ranked with the same labels ``y`` that the
    caller later evaluates on, so the selection is optimistic.

    Args:
        root: package root.
        split_seed: seed identifying the validation split directory.
        y: binary labels aligned with the score files.
        max_cols: maximum number of score files to keep.

    Returns:
        ``(features, names)``. For each kept file there is a z-scored and a
        rank-normalised column, followed by three aggregate columns
        (z-scored mean, z-scored std, ranked mean). With no usable file the
        result is an empty ``(len(y), 0)`` matrix and an empty name list.
    """
    files = sorted((root / "validation_runs" / f"dynamic_seed{split_seed}").glob("dyn*/scores/val_*.npy"))
    excluded_tags = ("hgt", "sage", "bce", "norm", "hinge")
    rows = []
    for p in files:
        # Match the tags only against the three globbed components
        # (run dir / "scores" / file name). Matching the full path would
        # drop every file whenever the package root itself happens to
        # contain one of the tags (e.g. ".../norman/...").
        variant_id = "/".join(p.parts[-3:])
        if any(tag in variant_id for tag in excluded_tags):
            continue
        x = np.load(p).astype(np.float32)
        if len(x) != len(y) or np.std(x) < 1e-8:
            continue
        f1, _, auc, _, _ = best_f1(y, x)
        rows.append((f1, auc, str(p), x))
    rows.sort(key=lambda r: r[0], reverse=True)
    chosen = rows[:max_cols]
    if not chosen:
        return np.zeros((len(y), 0), dtype=np.float32), []
    cols = []
    names = []
    raw_stack = []
    for _, _, name, x in chosen:
        raw_stack.append(x)
        cols.extend([zscore(x), rank01(x)])
        names.extend([name + "::z", name + "::rank"])
    # Cross-variant agreement features: per-row mean and spread of raw scores.
    raw = np.vstack(raw_stack)
    cols.extend([zscore(raw.mean(axis=0)), zscore(raw.std(axis=0)), rank01(raw.mean(axis=0))])
    names.extend(["lgcn_variant_mean_z", "lgcn_variant_std_z", "lgcn_variant_mean_rank"])
    return np.column_stack(cols).astype(np.float32), names


def main():
    """Run the incremental ablation pipeline for one validation split.

    Stages, in order: baseline stacker on rank + handcrafted features,
    bucketed error analysis, per-group threshold tuning, per-author quota
    tuning, then three cumulative feature additions (negative-evidence,
    top-k content similarity, multi-variant LightGCN scores).

    Everything is written under
    ``validation_runs/dynamic_seed<split_seed>/post95_ablation``: the
    ablation table, per-group threshold CSVs, the variant feature names and
    the out-of-fold predictions of each fitted stage.

    Raises:
        ValueError: if the LightGCN score file is not a 1-D array with one
            score per validation pair.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--package-root", type=Path, default=Path(__file__).resolve().parents[1])
    parser.add_argument("--split-seed", type=int, required=True)
    parser.add_argument("--lgcn-score-file", type=Path, required=True)
    parser.add_argument("--n-splits", type=int, default=5)
    parser.add_argument("--seed", type=int, default=0)
    args = parser.parse_args()

    root = args.package_root
    stack_mod = load_module("stack_rank_calibration", root / "code" / "stack_rank_calibration.py")
    lgcn_mod = load_module("train_val_lgcn_ensemble", root / "code" / "train_val_lgcn_ensemble.py")
    train_refs, val_pairs = lgcn_mod.make_notebook_style_split(root, args.split_seed, 0.9)
    pairs = val_pairs[["source", "target"]].to_numpy(np.int64)
    y = val_pairs["label"].to_numpy(np.int8)
    score_lgcn = np.load(args.lgcn_score_file).astype(np.float32)
    # Fail fast: a score file from another split would otherwise only
    # surface later as an opaque shape error, after the expensive feature
    # building and baseline fit.
    if score_lgcn.ndim != 1 or len(score_lgcn) != len(y):
        raise ValueError(
            f"{args.lgcn_score_file} has shape {score_lgcn.shape}, expected ({len(y)},) "
            f"to match the validation pairs of split seed {args.split_seed}"
        )
    builder = stack_mod.ExplicitGraphFeatures(root, train_refs)
    out_dir = root / "validation_runs" / f"dynamic_seed{args.split_seed}" / "post95_ablation"
    out_dir.mkdir(parents=True, exist_ok=True)

    rows = []

    def run_stage(stage: str, X: np.ndarray, seed: int):
        """Fit OOF predictions on X, record the stage metrics, return (oof, threshold, auc)."""
        oof = fit_lgb_oof(X, y, seed, args.n_splits)
        f1, th, auc, precision, recall = best_f1(y, oof)
        rows.append({"stage": stage, "f1": f1, "threshold": th, "auc": auc, "precision": precision, "recall": recall, "n_features": X.shape[1]})
        return oof, th, auc

    print("building baseline handcrafted/rank features")
    X_hand = builder.transform(pairs)
    X_rank = stack_mod.add_rank_features(pairs, score_lgcn)
    X_base = np.column_stack([X_rank, X_hand]).astype(np.float32)

    base_oof, base_th, base_auc = run_stage("baseline_stacking", X_base, args.seed)
    base_pred = (base_oof >= base_th).astype(np.int8)
    # Column 3 of the rank features is passed as the author-internal rank;
    # the column layout is defined by stack_rank_calibration (TODO confirm).
    error_analysis(y, base_oof, base_pred, pairs, X_hand, score_lgcn, X_rank[:, 3], out_dir)

    # Group threshold tuning on baseline OOF scores.
    author_bucket = bucket_series(X_hand[:, 0], "author_degree", [-np.inf, 1, 3, 8, 20, 50, np.inf])
    score_bucket = pd.qcut(score_lgcn, q=10, duplicates="drop")
    for name, group in [("group_threshold_author_degree", author_bucket), ("group_threshold_score_lgcn", score_bucket)]:
        gf1, gp, gr, thresholds, _ = group_threshold(y, base_oof, np.asarray(group))
        # The scores are still the baseline OOF scores, so the baseline AUC applies.
        rows.append({"stage": name, "f1": gf1, "threshold": np.nan, "auc": base_auc, "precision": gp, "recall": gr, "n_features": X_base.shape[1]})
        pd.Series(thresholds).to_csv(out_dir / f"{name}_thresholds.csv")
    quota = author_quota_tuning(y, base_oof, pairs, X_hand[:, 0])
    # For the quota stage the "threshold" column holds the chosen base ratio.
    rows.append({"stage": "author_quota_by_degree", "f1": quota["f1"], "threshold": quota["base_ratio"], "auc": np.nan, "precision": quota["precision"], "recall": quota["recall"], "n_features": X_base.shape[1]})

    print("adding negative-evidence features")
    X_neg = np.column_stack([X_base, negative_evidence_features(X_hand, score_lgcn)]).astype(np.float32)
    neg_oof, _, _ = run_stage("negative_evidence_features", X_neg, args.seed + 11)

    print("adding top-k content similarity features")
    X_sim = np.column_stack([X_neg, topk_content_similarity(root, pairs, builder)]).astype(np.float32)
    sim_oof, _, _ = run_stage("topk_similarity_features", X_sim, args.seed + 22)

    print("adding multi-LightGCN variant score features")
    X_var, names = load_lgcn_variant_scores(root, args.split_seed, y)
    (out_dir / "lgcn_variant_feature_names.txt").write_text("\n".join(names) + "\n")
    X_ens = np.column_stack([X_sim, X_var]).astype(np.float32)
    ens_oof, _, _ = run_stage("ensemble_lgcn_score_features", X_ens, args.seed + 33)

    result = pd.DataFrame(rows).sort_values("f1", ascending=False)
    result.to_csv(out_dir / "ablation_table.csv", index=False)
    np.save(out_dir / "baseline_oof.npy", base_oof)
    np.save(out_dir / "negative_oof.npy", neg_oof)
    np.save(out_dir / "similarity_oof.npy", sim_oof)
    np.save(out_dir / "ensemble_lgcn_oof.npy", ens_oof)
    print("\nAblation table:")
    print(result.to_string(index=False))


# Run the ablation pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()