| | |
| | |
| |
|
| | """Non-parametric bootstrap for discrete-choice model inference.""" |
| |
|
| | from __future__ import annotations |
| |
|
| | from dataclasses import dataclass |
| | from typing import Any, Callable |
| |
|
| | import numpy as np |
| | import pandas as pd |
| |
|
| | from .config import ModelSpec |
| | from .pipeline import estimate_dataframe |
| |
|
| |
|
@dataclass
class BootstrapResult:
    """Container for the outputs of a bootstrap run.

    Attributes
    ----------
    n_replications : int
        Number of replications that were requested.
    n_successful : int
        Number of replications that produced usable estimates.
    param_names : list[str]
        Parameter names; defines the row order of ``summary_dataframe``.
    estimates_matrix : np.ndarray
        Replication estimates (as filled by ``run_bootstrap``: one row per
        successful replication, one column per entry of ``param_names``).
    bootstrap_se : dict[str, float]
        Bootstrap standard error keyed by parameter name.
    percentile_ci : dict[str, tuple[float, float]]
        ``(lower, upper)`` percentile interval keyed by parameter name.
    original_estimates : dict[str, float]
        Point estimates from the fit on the un-resampled data, keyed by
        parameter name.
    """

    n_replications: int
    n_successful: int
    param_names: list[str]
    estimates_matrix: np.ndarray
    bootstrap_se: dict[str, float]
    percentile_ci: dict[str, tuple[float, float]]
    original_estimates: dict[str, float]

    def summary_dataframe(self) -> pd.DataFrame:
        """Tabulate the results, one row per name in ``param_names``.

        Returns
        -------
        pd.DataFrame
            Columns ``parameter``, ``original``, ``bootstrap_se``,
            ``ci_lower`` and ``ci_upper``. A value that is absent from the
            corresponding dictionary is reported as NaN.
        """
        nan = float("nan")
        no_interval = (nan, nan)

        def _record(name: str) -> dict:
            # A single lookup serves both interval bounds.
            interval = self.percentile_ci.get(name, no_interval)
            return {
                "parameter": name,
                "original": self.original_estimates.get(name, nan),
                "bootstrap_se": self.bootstrap_se.get(name, nan),
                "ci_lower": interval[0],
                "ci_upper": interval[1],
            }

        return pd.DataFrame([_record(name) for name in self.param_names])
| |
|
| |
|
def _resample_individuals(df: pd.DataFrame, id_col: str, rng: np.random.Generator) -> pd.DataFrame:
    """Resample individuals with replacement, keeping all rows per individual.

    As many individuals are drawn as there are distinct non-missing values in
    ``id_col``. Each draw contributes every row of the drawn individual, in
    the row order of ``df``, and is given a fresh integer id (0, 1, 2, ...) so
    that an individual drawn more than once appears as separate individuals.

    Parameters
    ----------
    df : pd.DataFrame
        Data with one or more rows per individual. It is not modified.
    id_col : str
        Name of the column identifying individuals.
    rng : np.random.Generator
        Source of randomness for the draws.

    Returns
    -------
    pd.DataFrame
        Resampled rows with a fresh ``0..n-1`` index and ``id_col`` replaced
        by the int64 draw number. Rows whose id is missing are not included.

    Raises
    ------
    ValueError
        If ``df`` contains no non-missing id to sample from.
    """
    # codes[i] is the individual number of row i (numbered by first
    # appearance); rows with a missing id get the sentinel -1.
    codes, uniques = pd.factorize(df[id_col])
    n_ids = len(uniques)
    if n_ids == 0:
        raise ValueError(
            f"Cannot resample: column {id_col!r} contains no individuals."
        )

    # Group row positions by individual once, instead of scanning the whole
    # frame for every draw. The stable sort keeps each individual's rows in
    # their original order; the -1 (missing id) rows sort first and are cut.
    order = np.argsort(codes, kind="stable")
    order = order[int((codes < 0).sum()):]
    counts = np.bincount(codes[codes >= 0], minlength=n_ids)
    rows_by_id = np.split(order, np.cumsum(counts)[:-1])

    draws = rng.choice(n_ids, size=n_ids, replace=True)
    row_positions = np.concatenate([rows_by_id[d] for d in draws])

    resampled = df.iloc[row_positions].reset_index(drop=True)
    # One new id per draw, repeated over that individual's rows.
    resampled[id_col] = np.repeat(np.arange(n_ids, dtype=np.int64), counts[draws])
    return resampled
| |
|
| |
|
def run_bootstrap(
    df: pd.DataFrame,
    spec: ModelSpec,
    model_type: str = "mixed",
    n_replications: int = 100,
    maxiter: int = 200,
    seed: int = 42,
    progress_callback: Callable[[int, int], None] | None = None,
    *,
    correlated: bool = False,
    correlation_groups: list[list[int]] | None = None,
    n_classes: int | None = None,
    n_starts: int = 10,
    membership_cols: list[str] | None = None,
    bws_worst_col: str | None = None,
    estimate_lambda_w: bool = True,
) -> BootstrapResult:
    """
    Run non-parametric bootstrap by resampling individuals with replacement.

    The model is first estimated on ``df`` itself to obtain the parameter
    names and the original point estimates. It is then re-estimated on
    ``n_replications`` resampled datasets. Standard errors (sample standard
    deviation, ``ddof=1``) and 2.5/97.5 percentile intervals are computed
    column-wise over the successful replications.

    Parameters
    ----------
    df : pd.DataFrame
        Long-format choice data.
    spec : ModelSpec
        Model specification.
    model_type : str
        "mixed", "conditional", "gmnl", or "latent_class".
    n_replications : int
        Number of bootstrap replications.
    maxiter : int
        Max optimizer iterations per replication.
    seed : int
        Base seed for reproducibility.
    progress_callback : callable, optional
        Called with (current_replication, n_replications) after each
        replication, whether it succeeded or not.
    correlated : bool
        Enable full correlation (Cholesky) for random parameters.
    correlation_groups : list[list[int]], optional
        Selective correlation groups (block-diagonal Cholesky).
    n_classes : int, optional
        Number of latent classes (for latent_class model type).
    n_starts : int
        Number of random starts (for latent_class).
    membership_cols : list[str], optional
        Membership covariates (for latent_class).
    bws_worst_col : str, optional
        Column name for BWS worst choices.
    estimate_lambda_w : bool
        Whether to estimate lambda_w for BWS.

    Returns
    -------
    BootstrapResult

    Raises
    ------
    RuntimeError
        If fewer than two replications succeed, since a standard error
        cannot be computed from fewer than two draws.

    Notes
    -----
    A replication whose estimation raises, or which returns a different
    number of estimates than the original fit, is logged as a warning and
    left out of the statistics; it does not abort the run. Errors from the
    initial fit on ``df`` are not caught.
    """
    # Local import so this function does not depend on a module-level change.
    import logging

    logger = logging.getLogger(__name__)
    rng = np.random.default_rng(seed)

    # Forward only the options that were actually set, so estimate_dataframe
    # keeps its own defaults for everything else.
    extra_kwargs: dict[str, Any] = {}
    if correlated:
        extra_kwargs["correlated"] = True
    if correlation_groups is not None:
        extra_kwargs["correlation_groups"] = correlation_groups
    if n_classes is not None:
        extra_kwargs["n_classes"] = n_classes
    if n_starts != 10:
        extra_kwargs["n_starts"] = n_starts
    if membership_cols:
        extra_kwargs["membership_cols"] = membership_cols
    if bws_worst_col:
        extra_kwargs["bws_worst_col"] = bws_worst_col
        extra_kwargs["estimate_lambda_w"] = estimate_lambda_w

    # Fit on the original data: source of parameter names and point estimates.
    original = estimate_dataframe(
        df, spec, model_type=model_type, maxiter=maxiter, seed=seed, **extra_kwargs,
    )
    original_est = original.estimation
    param_names = original_est.estimates["parameter"].tolist()
    original_values = dict(
        zip(original_est.estimates["parameter"], original_est.estimates["estimate"])
    )
    n_params = len(param_names)

    all_estimates: list[np.ndarray] = []

    for b in range(n_replications):
        # One derived seed per replication drives both the resampling and
        # the estimation of that replication.
        rep_seed = int(rng.integers(0, 2**31))
        resampled = _resample_individuals(df, spec.id_col, np.random.default_rng(rep_seed))

        try:
            result = estimate_dataframe(
                resampled, spec, model_type=model_type, maxiter=maxiter,
                seed=rep_seed, **extra_kwargs,
            )
            est_values = result.estimation.estimates["estimate"].to_numpy()
        except Exception as exc:
            # Best effort: a failed replication is skipped, but not silently.
            logger.warning(
                "Bootstrap replication %d/%d failed and was skipped: %r",
                b + 1, n_replications, exc,
            )
        else:
            if est_values.shape == (n_params,):
                all_estimates.append(est_values)
            else:
                # A row of a different length would make the matrix ragged
                # and break the per-parameter column statistics below.
                logger.warning(
                    "Bootstrap replication %d/%d returned %d estimates, "
                    "expected %d; skipped.",
                    b + 1, n_replications, est_values.size, n_params,
                )

        if progress_callback is not None:
            progress_callback(b + 1, n_replications)

    n_successful = len(all_estimates)
    if n_successful < 2:
        raise RuntimeError(
            f"Only {n_successful} of {n_replications} bootstrap replications succeeded. "
            "Cannot compute bootstrap statistics."
        )

    # Rows: successful replications. Columns: parameters in param_names order.
    estimates_matrix = np.array(all_estimates)

    bootstrap_se = {}
    percentile_ci = {}
    for i, name in enumerate(param_names):
        col = estimates_matrix[:, i]
        bootstrap_se[name] = float(np.std(col, ddof=1))
        percentile_ci[name] = (float(np.percentile(col, 2.5)), float(np.percentile(col, 97.5)))

    return BootstrapResult(
        n_replications=n_replications,
        n_successful=n_successful,
        param_names=param_names,
        estimates_matrix=estimates_matrix,
        bootstrap_se=bootstrap_se,
        percentile_ci=percentile_ci,
        original_estimates=original_values,
    )
| |
|