| |
| |
|
|
| """Statistical tests for discrete choice model comparison.""" |
|
|
| from __future__ import annotations |
|
|
| from dataclasses import dataclass |
|
|
| import numpy as np |
| import pandas as pd |
| from scipy.stats import chi2 |
|
|
| from .config import ModelSpec, VariableSpec |
| from .pipeline import estimate_dataframe |
|
|
|
|
@dataclass
class SwaitLouviereResult:
    """Outcome of the Swait-Louviere pooling test.

    Holds the chi-squared statistic, its degrees of freedom and p-value,
    the three log-likelihoods that produced it, and the labels of the two
    respondent groups that were compared.
    """

    chi2_stat: float        # -2 * [LL_pooled - (LL_group1 + LL_group2)]
    df: int                 # degrees of freedom used for the chi-squared reference
    p_value: float
    ll_pooled: float        # log-likelihood of the model on all respondents
    ll_group1: float
    ll_group2: float
    n_parameters: int       # parameter count of the pooled model
    group1_label: str
    group2_label: str
    reject_null: bool       # True when p_value falls below the 5% threshold

    @property
    def conclusion(self) -> str:
        """Plain-English verdict of the test at the 5% significance level."""
        if self.reject_null:
            head = f"Reject H0 at 5% level (p={self.p_value:.4f}). "
            tail = "The two groups have significantly different preference structures."
        else:
            head = f"Fail to reject H0 at 5% level (p={self.p_value:.4f}). "
            tail = "No significant difference in preferences between the two groups."
        return head + tail
|
|
|
|
def swait_louviere_test(
    df: pd.DataFrame,
    spec: ModelSpec,
    grouping_col: str,
    model_type: str = "conditional",
    maxiter: int = 200,
    seed: int = 123,
    n_classes: int = 2,
    n_starts: int = 10,
) -> SwaitLouviereResult:
    """
    Swait-Louviere pooling test for preference heterogeneity across groups.

    Tests whether two groups of respondents share the same preference parameters
    by comparing the pooled log-likelihood to the sum of group-specific
    log-likelihoods.

    Test statistic: -2 * [LL_pooled - (LL_group1 + LL_group2)]
    Distributed as chi-squared with K degrees of freedom (K = number of parameters).

    Parameters
    ----------
    df : pd.DataFrame
        Long-format choice data.
    spec : ModelSpec
        Model specification (column roles and variables).
    grouping_col : str
        Column used to split data into two groups. Must have exactly two unique values.
    model_type : str
        Type of model to estimate ('conditional', 'mixed', or 'latent_class').
    maxiter : int
        Maximum optimizer iterations.
    seed : int
        Random seed for estimation.
    n_classes : int
        Number of latent classes. Only used when model_type == 'latent_class'.
    n_starts : int
        Number of random starting points. Only used when model_type == 'latent_class'.

    Returns
    -------
    SwaitLouviereResult

    Raises
    ------
    ValueError
        If `grouping_col` is missing from `df`, or does not contain exactly
        two distinct non-null values.
    """
    if grouping_col not in df.columns:
        raise ValueError(f"Grouping column '{grouping_col}' not found in data.")

    groups = df[grouping_col].dropna().unique()
    if len(groups) != 2:
        raise ValueError(
            f"Grouping column must have exactly 2 unique values, found {len(groups)}: {groups}"
        )

    # `unique()` preserves order of first appearance, so group1 is whichever
    # value occurs first in the data.
    group1_label, group2_label = str(groups[0]), str(groups[1])
    df_g1 = df[df[grouping_col] == groups[0]].copy()
    df_g2 = df[df[grouping_col] == groups[1]].copy()

    # Identical estimation settings for all three fits so the likelihoods
    # are comparable; class/start counts only apply to latent-class models.
    est_kwargs = dict(spec=spec, model_type=model_type, maxiter=maxiter, seed=seed)
    if model_type == "latent_class":
        est_kwargs["n_classes"] = n_classes
        est_kwargs["n_starts"] = n_starts

    result_pooled = estimate_dataframe(df=df, **est_kwargs)
    result_g1 = estimate_dataframe(df=df_g1, **est_kwargs)
    result_g2 = estimate_dataframe(df=df_g2, **est_kwargs)

    ll_pooled = result_pooled.estimation.log_likelihood
    ll_g1 = result_g1.estimation.log_likelihood
    ll_g2 = result_g2.estimation.log_likelihood
    k = result_pooled.estimation.n_parameters

    # The pooled model is (numerically) nested in the split models, so the
    # statistic is non-negative in theory; optimizer noise can make it
    # marginally negative, so clamp at zero.
    chi2_stat = max(0.0, -2.0 * (ll_pooled - (ll_g1 + ll_g2)))
    # chi2.sf is the survival function: numerically stable upper tail,
    # unlike 1 - cdf which underflows to 0 for large statistics.
    p_value = float(chi2.sf(chi2_stat, k))

    return SwaitLouviereResult(
        chi2_stat=chi2_stat,
        df=k,
        p_value=p_value,
        ll_pooled=ll_pooled,
        ll_group1=ll_g1,
        ll_group2=ll_g2,
        n_parameters=k,
        group1_label=group1_label,
        group2_label=group2_label,
        reject_null=p_value < 0.05,
    )
|
|