Spaces:
Running
Running
| """ | |
| Frequentist A/B testing: z-test, t-test, power analysis, FDR correction. | |
| All functions return typed dataclasses for easy serialisation. | |
| """ | |
| from __future__ import annotations | |
| import numpy as np | |
| from dataclasses import dataclass, asdict | |
| from typing import List, Tuple | |
| from scipy import stats | |
# ── Result container ──────────────────────────────────────────────────────────
@dataclass
class TestResult:
    """Typed container for the outcome of one frequentist A/B test.

    Declared as a dataclass so that it can be built with keyword arguments
    and converted to a plain dict with :func:`dataclasses.asdict`.
    """

    # The name starts with "Test"; tell pytest this is not a test class so
    # importing it into a test module does not trigger collection warnings.
    __test__ = False

    test_name: str        # human-readable name of the test that was run
    statistic: float      # test statistic (z, t or chi-square)
    p_value: float
    ci_lower: float       # lower bound of CI / credible interval for diff
    ci_upper: float       # upper bound of the same interval
    observed_diff: float  # point estimate of (B - A)
    relative_lift: float  # (B - A) / A [%]
    effect_size: float    # standardised effect size, see ``effect_name``
    effect_name: str      # which effect-size measure ``effect_size`` holds
    significant: bool     # whether p_value < alpha
    alpha: float          # significance level the test was run at

    def to_dict(self) -> dict:
        """Return the result as a plain ``{field name: value}`` dict."""
        return asdict(self)
# ── Two-proportion Z-test ─────────────────────────────────────────────────────
def two_proportion_ztest(
    n_a: int, conv_a: int,
    n_b: int, conv_b: int,
    alpha: float = 0.05,
    two_tailed: bool = True,
) -> TestResult:
    """
    Compare the conversion rates of groups A and B with a z-test.

    The statistic is standardised by the pooled standard error (both groups
    share one rate under H0); the confidence interval for (B - A) is built
    from the unpooled standard error.

    Args:
        n_a, n_b: number of observations in each group.
        conv_a, conv_b: number of conversions in each group.
        alpha: significance level; also sets the CI width (1 - alpha).
        two_tailed: if True the p-value is two-sided, otherwise it is the
            upper-tail probability P(Z > z).

    Returns:
        A TestResult with the z statistic, p-value, CI for the difference,
        relative lift in percent and Cohen's h.
    """
    rate_a = conv_a / n_a
    rate_b = conv_b / n_b
    delta = rate_b - rate_a

    # Test statistic: pooled rate and pooled standard error under H0.
    pooled_rate = (conv_a + conv_b) / (n_a + n_b)
    pooled_se = np.sqrt(pooled_rate * (1 - pooled_rate) * (1 / n_a + 1 / n_b))
    if pooled_se == 0:
        # Substitute a tiny value so the division below cannot fail.
        pooled_se = 1e-12
    z_stat = delta / pooled_se

    if two_tailed:
        p_val = 2 * (1 - stats.norm.cdf(abs(z_stat)))
    else:
        p_val = 1 - stats.norm.cdf(z_stat)

    # Confidence interval: unpooled standard error, always two-sided.
    unpooled_se = np.sqrt(
        rate_a * (1 - rate_a) / n_a + rate_b * (1 - rate_b) / n_b
    )
    critical = stats.norm.ppf(1 - alpha / 2)
    lower = delta - critical * unpooled_se
    upper = delta + critical * unpooled_se

    # Cohen's h effect size
    h = 2 * np.arcsin(np.sqrt(rate_b)) - 2 * np.arcsin(np.sqrt(rate_a))

    # Relative lift is reported as 0 when the baseline rate is zero.
    if rate_a > 0:
        lift_pct = delta / rate_a * 100
    else:
        lift_pct = 0.0

    return TestResult(
        test_name="Two-proportion Z-test",
        statistic=round(z_stat, 4),
        p_value=round(p_val, 6),
        ci_lower=round(lower, 6),
        ci_upper=round(upper, 6),
        observed_diff=round(delta, 6),
        relative_lift=round(lift_pct, 2),
        effect_size=round(h, 4),
        effect_name="Cohen's h",
        significant=bool(p_val < alpha),
        alpha=alpha,
    )
# ── Two-sample t-test ─────────────────────────────────────────────────────────
def two_sample_ttest(
    mean_a: float, std_a: float, n_a: int,
    mean_b: float, std_b: float, n_b: int,
    alpha: float = 0.05,
    equal_var: bool = False,
) -> TestResult:
    """
    Two-sample t-test for a difference in means (e.g., revenue per user),
    computed from summary statistics.

    equal_var=False uses Welch's approximation; equal_var=True uses Student's
    pooled-variance test. The confidence interval, degrees of freedom and
    reported test name follow the same choice as the test statistic.

    Args:
        mean_a, std_a, n_a: mean, standard deviation and size of group A.
        mean_b, std_b, n_b: mean, standard deviation and size of group B.
        alpha: significance level; also sets the CI width (1 - alpha).
        equal_var: assume equal population variances (Student's t-test).

    Returns:
        A TestResult whose statistic, observed_diff and CI are all oriented
        as (B - A), with Cohen's d as the effect size.
    """
    diff = mean_b - mean_a

    # Group B goes first so that the sign of t matches observed_diff (B - A);
    # the two-sided p-value does not depend on the order.
    t, p_value = stats.ttest_ind_from_stats(
        mean_b, std_b, n_b, mean_a, std_a, n_a, equal_var=equal_var
    )

    var_a = std_a ** 2
    var_b = std_b ** 2

    # Cohen's d (simple average of the two variances as denominator)
    pooled_std = np.sqrt((var_a + var_b) / 2)
    cohens_d = diff / pooled_std if pooled_std > 0 else 0.0

    if equal_var:
        # Student: pooled variance and n_a + n_b - 2 degrees of freedom.
        test_name = "Student's t-test"
        df = n_a + n_b - 2
        pooled_var = ((n_a - 1) * var_a + (n_b - 1) * var_b) / df
        se = np.sqrt(pooled_var * (1 / n_a + 1 / n_b))
    else:
        # Welch: unpooled standard error and Welch–Satterthwaite df.
        test_name = "Welch's t-test"
        se = np.sqrt(var_a / n_a + var_b / n_b)
        df_num = (var_a / n_a + var_b / n_b) ** 2
        df_den = (var_a / n_a) ** 2 / (n_a - 1) + (var_b / n_b) ** 2 / (n_b - 1)
        # Both variances zero: fall back to the pooled degrees of freedom.
        df = df_num / df_den if df_den > 0 else n_a + n_b - 2

    t_crit = stats.t.ppf(1 - alpha / 2, df)
    ci = (diff - t_crit * se, diff + t_crit * se)

    # Relative lift is reported as 0 when the baseline mean is zero.
    rel_lift = diff / mean_a * 100 if mean_a != 0 else 0.0

    return TestResult(
        test_name=test_name,
        statistic=round(float(t), 4),
        p_value=round(float(p_value), 6),
        ci_lower=round(ci[0], 4),
        ci_upper=round(ci[1], 4),
        observed_diff=round(diff, 4),
        relative_lift=round(rel_lift, 2),
        effect_size=round(cohens_d, 4),
        effect_name="Cohen's d",
        significant=bool(p_value < alpha),
        alpha=alpha,
    )
# ── Power analysis ────────────────────────────────────────────────────────────
def compute_power(
    n_per_group: int,
    baseline_rate: float,
    mde: float,
    alpha: float = 0.05,
    two_tailed: bool = True,
) -> float:
    """
    Statistical power for a two-proportion z-test.

    Power = P(reject H0 | H1 is true), under the normal approximation.
    The critical value is scaled by the pooled standard deviation (H0) and
    the result is standardised by the unpooled standard deviation (H1).
    This is the same variance model as ``required_sample_size``, so the
    two functions are inverses of each other up to rounding of n.

    Args:
        n_per_group: sample size of each group; must be positive.
        baseline_rate: conversion rate of the control group.
        mde: absolute difference in rate to detect (treatment - control).
        alpha: significance level of the test.
        two_tailed: use alpha / 2 for the critical value when True.

    Returns:
        Power as a float in [0, 1]; 0.0 when the pooled variance is zero.

    Raises:
        ValueError: if ``n_per_group`` is not positive.
    """
    if n_per_group <= 0:
        raise ValueError("n_per_group must be a positive sample size")

    p1 = baseline_rate
    p2 = baseline_rate + mde
    p_avg = (p1 + p2) / 2

    # Per-observation standard deviations of the difference under H0 / H1.
    sd_null = np.sqrt(2 * p_avg * (1 - p_avg))
    if sd_null == 0:
        return 0.0
    sd_alt = np.sqrt(p1 * (1 - p1) + p2 * (1 - p2))

    z_alpha = stats.norm.ppf(1 - alpha / (2 if two_tailed else 1))
    # Distance between the true effect and the rejection threshold, in
    # units of a single observation's standard deviation.
    margin = abs(p2 - p1) * np.sqrt(n_per_group) - z_alpha * sd_null

    if sd_alt == 0:
        # No sampling noise under H1: the test either always rejects or never.
        return 1.0 if margin > 0 else 0.0
    return float(stats.norm.cdf(margin / sd_alt))
def required_sample_size(
    baseline_rate: float,
    mde: float,
    alpha: float = 0.05,
    power: float = 0.80,
    two_tailed: bool = True,
) -> int:
    """
    Minimum sample size per group for a two-proportion z-test.

    Uses the closed-form normal-approximation formula rather than a binary
    search for speed: the critical value is scaled by the pooled standard
    deviation (H0) and the power quantile by the unpooled one (H1).

    Args:
        baseline_rate: conversion rate of the control group.
        mde: absolute difference in rate to detect; must be non-zero.
        alpha: significance level of the test.
        power: desired probability of detecting the effect.
        two_tailed: use alpha / 2 for the critical value when True.

    Returns:
        The per-group sample size, rounded up to an integer.

    Raises:
        ValueError: if ``mde`` is zero, since no finite sample can detect a
            zero difference.
    """
    if mde == 0:
        raise ValueError("mde must be non-zero to compute a sample size")

    p1 = baseline_rate
    p2 = baseline_rate + mde
    p_avg = (p1 + p2) / 2

    z_alpha = stats.norm.ppf(1 - alpha / (2 if two_tailed else 1))
    z_beta = stats.norm.ppf(power)

    numerator = (
        z_alpha * np.sqrt(2 * p_avg * (1 - p_avg))
        + z_beta * np.sqrt(p1 * (1 - p1) + p2 * (1 - p2))
    ) ** 2
    denominator = (p2 - p1) ** 2
    return int(np.ceil(numerator / denominator))
def power_curve(
    baseline_rate: float,
    mde: float,
    alpha: float = 0.05,
    n_max_multiplier: float = 3.0,
) -> Tuple[List[int], List[float]]:
    """
    Tabulate power against per-group sample size, e.g. for plotting.

    The grid starts at 50 and stops before ``n_max_multiplier`` times the
    sample size required for 80% power (but never before 500), using a step
    of one two-hundredth of that upper limit (at least 1).

    Args:
        baseline_rate: conversion rate of the control group.
        mde: absolute difference in rate to detect.
        alpha: significance level of the test.
        n_max_multiplier: how far past the 80%-power sample size to extend.

    Returns:
        (sample_sizes, powers) as two parallel lists.
    """
    target_n = required_sample_size(baseline_rate, mde, alpha, power=0.80)
    upper = max(int(target_n * n_max_multiplier), 500)
    stride = max(1, upper // 200)

    sample_sizes = list(range(50, upper, stride))
    powers = [
        compute_power(size, baseline_rate, mde, alpha)
        for size in sample_sizes
    ]
    return sample_sizes, powers
# ── Multiple testing correction ───────────────────────────────────────────────
def fdr_correction(
    p_values: List[float],
    alpha: float = 0.05,
) -> Tuple[List[float], List[bool]]:
    """
    Benjamini–Hochberg FDR correction for multiple simultaneous tests.

    NaN p-values (e.g. from a test that could not be computed) are left out
    of the correction: they do not count towards the number of tests, come
    back as NaN and are never flagged significant. Without this, a single
    NaN would turn every adjusted p-value into NaN.

    Args:
        p_values: raw p-values, one per test.
        alpha: false-discovery-rate threshold applied to adjusted p-values.

    Returns:
        (adjusted_p_values, significance_flags), both in the input order.
        Adjusted values are plain Python floats.
    """
    raw = np.asarray(p_values, dtype=float)
    adjusted = np.full(raw.shape, np.nan)

    valid = ~np.isnan(raw)
    n = int(valid.sum())
    if n:
        usable = raw[valid]
        order = np.argsort(usable)
        # BH step-up procedure: p_(i) * n / i for the i-th smallest p-value
        stepped = usable[order] * n / np.arange(1, n + 1)
        # Enforce monotonicity (right to left minimum)
        stepped = np.minimum.accumulate(stepped[::-1])[::-1]
        stepped = np.minimum(stepped, 1.0)
        # Map back to the original order of the usable p-values
        restored = np.empty(n)
        restored[order] = stepped
        adjusted[valid] = restored

    # Comparisons with NaN are False, so NaN entries are never significant.
    flags = [bool(value < alpha) for value in adjusted]
    return adjusted.tolist(), flags
# ── Chi-square test of independence ───────────────────────────────────────────
def chi_square_test(
    n_a: int, conv_a: int,
    n_b: int, conv_b: int,
    alpha: float = 0.05,
) -> TestResult:
    """
    Chi-square test of independence for a 2×2 contingency table.

    Equivalent to the z-test for proportions (z² = χ²) but more familiar
    to some practitioners. No continuity correction is applied.

    When nobody or everybody converted across both groups, the table has an
    expected count of zero and scipy refuses to compute the test. That case
    is reported as "no evidence of a difference" (statistic 0, p-value 1),
    mirroring the zero-standard-error guard in ``two_proportion_ztest``.

    Args:
        n_a, n_b: number of observations in each group.
        conv_a, conv_b: number of conversions in each group.
        alpha: significance level.

    Returns:
        A TestResult with the chi-square statistic, p-value, observed
        difference (B - A), relative lift in percent and Cramér's V. The
        confidence-interval bounds are NaN because this test provides none.
    """
    n_total = n_a + n_b
    conv_total = conv_a + conv_b

    # Both groups non-empty but one outcome never occurred.
    degenerate = n_a > 0 and n_b > 0 and conv_total in (0, n_total)
    if degenerate:
        chi2, p_value = 0.0, 1.0
    else:
        table = np.array([
            [conv_a, n_a - conv_a],
            [conv_b, n_b - conv_b],
        ])
        chi2, p_value, _, _ = stats.chi2_contingency(table, correction=False)

    p_a = conv_a / n_a
    p_b = conv_b / n_b
    diff = p_b - p_a

    # Cramér's V effect size (for a 2×2 table this is sqrt(chi2 / n))
    cramers_v = np.sqrt(chi2 / n_total)

    return TestResult(
        test_name="Chi-square test",
        statistic=round(chi2, 4),
        p_value=round(p_value, 6),
        ci_lower=float("nan"),
        ci_upper=float("nan"),
        observed_diff=round(diff, 6),
        relative_lift=round(diff / p_a * 100 if p_a > 0 else 0.0, 2),
        effect_size=round(cramers_v, 4),
        effect_name="Cramér's V",
        significant=bool(p_value < alpha),
        alpha=alpha,
    )