# Copyright (C) 2026 Hengzhe Zhao. All rights reserved.
# Licensed under dual license: AGPL-3.0 (open-source) or commercial. See LICENSE.

"""Statistical tests for discrete choice model comparison."""

from __future__ import annotations

from dataclasses import dataclass

import numpy as np
import pandas as pd
from scipy.stats import chi2

from .config import ModelSpec, VariableSpec
from .pipeline import estimate_dataframe


@dataclass
class SwaitLouviereResult:
    """Result of the Swait-Louviere pooling test."""

    chi2_stat: float  # -2 * [LL_pooled - (LL_group1 + LL_group2)]
    df: int  # degrees of freedom used for the chi-squared reference distribution
    p_value: float  # upper-tail probability of chi2_stat under chi2(df)
    ll_pooled: float  # log-likelihood of the model fitted on both groups together
    ll_group1: float  # log-likelihood of the model fitted on group 1 only
    ll_group2: float  # log-likelihood of the model fitted on group 2 only
    n_parameters: int  # parameter count reported by the pooled estimation
    group1_label: str  # str() of the first unique grouping value
    group2_label: str  # str() of the second unique grouping value
    reject_null: bool  # at 5% level

    @property
    def conclusion(self) -> str:
        """Return a one-sentence, human-readable verdict at the 5% level."""
        if self.reject_null:
            return (
                f"Reject H0 at 5% level (p={self.p_value:.4f}). "
                "The two groups have significantly different preference structures."
            )
        return (
            f"Fail to reject H0 at 5% level (p={self.p_value:.4f}). "
            "No significant difference in preferences between the two groups."
        )


def swait_louviere_test(
    df: pd.DataFrame,
    spec: ModelSpec,
    grouping_col: str,
    model_type: str = "conditional",
    maxiter: int = 200,
    seed: int = 123,
    n_classes: int = 2,
    n_starts: int = 10,
) -> SwaitLouviereResult:
    """
    Swait-Louviere pooling test for preference heterogeneity across groups.

    Tests whether two groups of respondents share the same preference
    parameters by comparing the pooled log-likelihood to the sum of
    group-specific log-likelihoods.

    Test statistic: -2 * [LL_pooled - (LL_group1 + LL_group2)]
    Distributed as chi-squared with K degrees of freedom (K = number of
    parameters reported by the pooled estimation).

    Rows whose ``grouping_col`` value is missing belong to neither group and
    are excluded from all three estimations, so the pooled and group-specific
    log-likelihoods are computed on the same observations.

    Parameters
    ----------
    df : pd.DataFrame
        Long-format choice data.
    spec : ModelSpec
        Model specification (column roles and variables).
    grouping_col : str
        Column used to split data into two groups. Must have exactly two
        unique non-missing values.
    model_type : str
        Type of model to estimate ('conditional', 'mixed', or 'latent_class').
    maxiter : int
        Maximum optimizer iterations.
    seed : int
        Random seed for estimation.
    n_classes : int
        Forwarded to ``estimate_dataframe`` as ``n_classes``; only used when
        ``model_type == "latent_class"``.
    n_starts : int
        Forwarded to ``estimate_dataframe`` as ``n_starts``; only used when
        ``model_type == "latent_class"``.

    Returns
    -------
    SwaitLouviereResult

    Raises
    ------
    ValueError
        If ``grouping_col`` is not a column of ``df``, or if it does not
        contain exactly two unique non-missing values.
    """
    if grouping_col not in df.columns:
        raise ValueError(f"Grouping column '{grouping_col}' not found in data.")

    grouping = df[grouping_col]
    groups = grouping.dropna().unique()
    if len(groups) != 2:
        raise ValueError(
            f"Grouping column must have exactly 2 unique values, found {len(groups)}: {groups}"
        )

    group1_label, group2_label = str(groups[0]), str(groups[1])
    df_g1 = df[grouping == groups[0]].copy()
    df_g2 = df[grouping == groups[1]].copy()

    # The pooled sample must be exactly the union of the two group samples;
    # otherwise rows with a missing group value would enter LL_pooled but
    # neither LL_group1 nor LL_group2. When nothing is missing, pass ``df``
    # through untouched.
    has_group = grouping.notna()
    df_pooled = df if has_group.all() else df[has_group].copy()

    est_kwargs = dict(spec=spec, model_type=model_type, maxiter=maxiter, seed=seed)
    if model_type == "latent_class":
        est_kwargs["n_classes"] = n_classes
        est_kwargs["n_starts"] = n_starts

    # Estimate on each group and pooled
    result_pooled = estimate_dataframe(df=df_pooled, **est_kwargs)
    result_g1 = estimate_dataframe(df=df_g1, **est_kwargs)
    result_g2 = estimate_dataframe(df=df_g2, **est_kwargs)

    ll_pooled = result_pooled.estimation.log_likelihood
    ll_g1 = result_g1.estimation.log_likelihood
    ll_g2 = result_g2.estimation.log_likelihood
    k = result_pooled.estimation.n_parameters

    chi2_stat = float(-2.0 * (ll_pooled - (ll_g1 + ll_g2)))
    # sf() is the upper-tail probability; it avoids the cancellation error of
    # 1 - cdf() when the p-value is very small.
    p_value = float(chi2.sf(chi2_stat, k))

    return SwaitLouviereResult(
        chi2_stat=chi2_stat,
        df=k,
        p_value=p_value,
        ll_pooled=ll_pooled,
        ll_group1=ll_g1,
        ll_group2=ll_g2,
        n_parameters=k,
        group1_label=group1_label,
        group2_label=group2_label,
        reject_null=p_value < 0.05,
    )