File size: 6,444 Bytes
247642a
 
 
5ed1762
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
# Copyright (C) 2026 Hengzhe Zhao. All rights reserved.
# Licensed under dual license: AGPL-3.0 (open-source) or commercial. See LICENSE.

"""Non-parametric bootstrap for discrete-choice model inference."""

from __future__ import annotations

from dataclasses import dataclass
from typing import Any, Callable

import numpy as np
import pandas as pd

from .config import ModelSpec
from .pipeline import estimate_dataframe


@dataclass
class BootstrapResult:
    """Container for the output of a non-parametric bootstrap run."""

    n_replications: int  # number of replications requested
    n_successful: int  # number of replications that produced estimates
    param_names: list[str]
    estimates_matrix: np.ndarray  # (n_successful, n_params) — each row is one replication
    bootstrap_se: dict[str, float]
    percentile_ci: dict[str, tuple[float, float]]  # 95% CI per parameter
    original_estimates: dict[str, float]

    def summary_dataframe(self) -> pd.DataFrame:
        """Return one row per parameter: original estimate, bootstrap SE, 95% CI.

        Parameters missing from any of the result dicts get NaN in the
        corresponding column(s).
        """
        nan = float("nan")
        records = []
        for param in self.param_names:
            ci_lower, ci_upper = self.percentile_ci.get(param, (nan, nan))
            records.append(
                {
                    "parameter": param,
                    "original": self.original_estimates.get(param, nan),
                    "bootstrap_se": self.bootstrap_se.get(param, nan),
                    "ci_lower": ci_lower,
                    "ci_upper": ci_upper,
                }
            )
        return pd.DataFrame(records)


def _resample_individuals(df: pd.DataFrame, id_col: str, rng: np.random.Generator) -> pd.DataFrame:
    """Resample individuals with replacement, keeping all tasks per individual."""
    unique_ids = df[id_col].unique()
    sampled_ids = rng.choice(unique_ids, size=len(unique_ids), replace=True)

    parts = []
    for new_idx, orig_id in enumerate(sampled_ids):
        chunk = df[df[id_col] == orig_id].copy()
        chunk[id_col] = new_idx
        parts.append(chunk)

    return pd.concat(parts, ignore_index=True)


def run_bootstrap(
    df: pd.DataFrame,
    spec: ModelSpec,
    model_type: str = "mixed",
    n_replications: int = 100,
    maxiter: int = 200,
    seed: int = 42,
    progress_callback: Callable[[int, int], None] | None = None,
    *,
    correlated: bool = False,
    correlation_groups: list[list[int]] | None = None,
    n_classes: int | None = None,
    n_starts: int = 10,
    membership_cols: list[str] | None = None,
    bws_worst_col: str | None = None,
    estimate_lambda_w: bool = True,
) -> BootstrapResult:
    """
    Run non-parametric bootstrap by resampling individuals with replacement.

    Parameters
    ----------
    df : pd.DataFrame
        Long-format choice data.
    spec : ModelSpec
        Model specification.
    model_type : str
        "mixed", "conditional", "gmnl", or "latent_class".
    n_replications : int
        Number of bootstrap replications.
    maxiter : int
        Max optimizer iterations per replication.
    seed : int
        Base seed for reproducibility.
    progress_callback : callable, optional
        Called with (current_replication, n_replications) after each replication.
    correlated : bool
        Enable full correlation (Cholesky) for random parameters.
    correlation_groups : list[list[int]], optional
        Selective correlation groups (block-diagonal Cholesky).
    n_classes : int, optional
        Number of latent classes (for latent_class model type).
    n_starts : int
        Number of random starts (for latent_class).
    membership_cols : list[str], optional
        Membership covariates (for latent_class).
    bws_worst_col : str, optional
        Column name for BWS worst choices.
    estimate_lambda_w : bool
        Whether to estimate lambda_w for BWS.

    Returns
    -------
    BootstrapResult

    Raises
    ------
    RuntimeError
        If fewer than 2 replications succeed, so no statistics can be computed.
    """
    rng = np.random.default_rng(seed)

    # Build extra kwargs for estimate_dataframe; only forward non-default
    # options so the pipeline's own defaults stay in control otherwise.
    extra_kwargs: dict[str, Any] = {}
    if correlated:
        extra_kwargs["correlated"] = True
    if correlation_groups is not None:
        extra_kwargs["correlation_groups"] = correlation_groups
    if n_classes is not None:
        extra_kwargs["n_classes"] = n_classes
    if n_starts != 10:
        extra_kwargs["n_starts"] = n_starts
    if membership_cols:
        extra_kwargs["membership_cols"] = membership_cols
    if bws_worst_col:
        extra_kwargs["bws_worst_col"] = bws_worst_col
        extra_kwargs["estimate_lambda_w"] = estimate_lambda_w

    # Run original (full-sample) estimation for reference values and the
    # canonical parameter ordering.
    original = estimate_dataframe(
        df, spec, model_type=model_type, maxiter=maxiter, seed=seed, **extra_kwargs,
    )
    original_est = original.estimation
    param_names = original_est.estimates["parameter"].tolist()
    original_values = dict(
        zip(original_est.estimates["parameter"], original_est.estimates["estimate"])
    )

    all_estimates: list[np.ndarray] = []
    n_successful = 0

    for b in range(n_replications):
        # Per-replication seed drawn from the base rng keeps replications
        # independent yet fully reproducible from `seed`.
        rep_seed = int(rng.integers(0, 2**31))
        resampled = _resample_individuals(df, spec.id_col, np.random.default_rng(rep_seed))

        try:
            result = estimate_dataframe(
                resampled, spec, model_type=model_type, maxiter=maxiter,
                seed=rep_seed, **extra_kwargs,
            )
            est_values = result.estimation.estimates["estimate"].to_numpy()
            # Guard against a replication that converged to a different
            # parameterization (e.g. a degenerate latent class): appending a
            # vector of the wrong length would make the stacked matrix ragged
            # and break np.std / np.percentile below. Treat it as a failure.
            if est_values.shape[0] == len(param_names):
                all_estimates.append(est_values)
                n_successful += 1
        except Exception:
            pass  # best-effort: skip failed replications

        if progress_callback is not None:
            progress_callback(b + 1, n_replications)

    if n_successful < 2:
        raise RuntimeError(
            f"Only {n_successful} of {n_replications} bootstrap replications succeeded. "
            "Cannot compute bootstrap statistics."
        )

    estimates_matrix = np.array(all_estimates)  # (n_successful, n_params)

    bootstrap_se = {}
    percentile_ci = {}
    for i, name in enumerate(param_names):
        col = estimates_matrix[:, i]
        # ddof=1: sample standard deviation across replications.
        bootstrap_se[name] = float(np.std(col, ddof=1))
        percentile_ci[name] = (float(np.percentile(col, 2.5)), float(np.percentile(col, 97.5)))

    return BootstrapResult(
        n_replications=n_replications,
        n_successful=n_successful,
        param_names=param_names,
        estimates_matrix=estimates_matrix,
        bootstrap_se=bootstrap_se,
        percentile_ci=percentile_ci,
        original_estimates=original_values,
    )