File size: 8,626 Bytes
4256820
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
"""
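Frequentist A/B testing: z-test, t-test, chi-square test, power analysis,
FDR correction.

The hypothesis-test functions return a typed ``TestResult`` dataclass for easy
serialisation; the power-analysis and FDR helpers return plain numbers and lists.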
"""
from __future__ import annotations

import numpy as np
from dataclasses import dataclass, asdict
from typing import List, Tuple

from scipy import stats


# ── Result container ──────────────────────────────────────────────────────────

@dataclass
class TestResult:
    """Flat record describing the outcome of one two-group (A vs. B) test."""

    test_name: str           # label of the test that produced this record
    statistic: float         # value of the test statistic
    p_value: float
    ci_lower: float          # interval for the difference: lower bound
    ci_upper: float          # interval for the difference: upper bound
    observed_diff: float     # point estimate of B - A
    relative_lift: float     # (B - A) / A, expressed in percent
    effect_size: float       # magnitude measure identified by ``effect_name``
    effect_name: str
    significant: bool        # flag stored alongside the level ``alpha``
    alpha: float

    def to_dict(self) -> dict:
        """Convert the record into a plain dictionary keyed by field name."""
        payload = asdict(self)
        return payload


# ── Two-proportion Z-test ─────────────────────────────────────────────────────

def two_proportion_ztest(
    n_a: int, conv_a: int,
    n_b: int, conv_b: int,
    alpha: float = 0.05,
    two_tailed: bool = True,
) -> TestResult:
    """
    Compare the conversion rates of two groups with a z-test.

    The test statistic divides the rate difference (B - A) by the pooled
    standard error, while the confidence interval is built from the unpooled
    standard error.

    Args:
        n_a, conv_a: number of observations and conversions in group A.
        n_b, conv_b: number of observations and conversions in group B.
        alpha: significance level; also sets the interval width via
            the normal quantile at ``1 - alpha / 2``.
        two_tailed: when False the p-value is the upper-tail probability
            of z (i.e. evidence for B > A).

    Returns:
        A ``TestResult`` with Cohen's h as effect size and the relative lift
        in percent (0.0 when group A's rate is not positive).
    """
    rate_a = conv_a / n_a
    rate_b = conv_b / n_b
    diff = rate_b - rate_a

    # Test statistic: pooled rate gives the standard error under H0.
    pooled_rate = (conv_a + conv_b) / (n_a + n_b)
    null_se = np.sqrt(pooled_rate * (1 - pooled_rate) * (1 / n_a + 1 / n_b))
    if null_se == 0:
        # Pooled rate of exactly 0 or 1: substitute a tiny SE to avoid 0/0.
        null_se = 1e-12
    z = diff / null_se

    if two_tailed:
        p_value = 2 * (1 - stats.norm.cdf(abs(z)))
    else:
        p_value = 1 - stats.norm.cdf(z)

    # Interval: per-group variances, not the pooled one.
    wald_se = np.sqrt(rate_a * (1 - rate_a) / n_a + rate_b * (1 - rate_b) / n_b)
    half_width = stats.norm.ppf(1 - alpha / 2) * wald_se
    lower, upper = diff - half_width, diff + half_width

    # Cohen's h: difference of arcsine-transformed rates.
    phi_b = 2 * np.arcsin(np.sqrt(rate_b))
    phi_a = 2 * np.arcsin(np.sqrt(rate_a))
    cohen_h = phi_b - phi_a

    if rate_a > 0:
        rel_lift = diff / rate_a * 100
    else:
        rel_lift = 0.0

    return TestResult(
        test_name="Two-proportion Z-test",
        statistic=round(z, 4),
        p_value=round(p_value, 6),
        ci_lower=round(lower, 6),
        ci_upper=round(upper, 6),
        observed_diff=round(diff, 6),
        relative_lift=round(rel_lift, 2),
        effect_size=round(cohen_h, 4),
        effect_name="Cohen's h",
        significant=bool(p_value < alpha),
        alpha=alpha,
    )


# ── Two-sample t-test ─────────────────────────────────────────────────────────

def two_sample_ttest(
    mean_a: float, std_a: float, n_a: int,
    mean_b: float, std_b: float, n_b: int,
    alpha: float = 0.05,
    equal_var: bool = False,
) -> TestResult:
    """
    Two-sample t-test for a difference in means (e.g., revenue per user),
    computed from summary statistics.

    equal_var=False uses Welch's approximation; equal_var=True uses Student's
    pooled-variance test. The test statistic, the confidence interval, the
    effect size and ``observed_diff`` are all oriented as B - A, and the
    interval uses the standard error and degrees of freedom that belong to
    the selected test.

    Args:
        mean_a, std_a, n_a: mean, standard deviation and size of group A.
        mean_b, std_b, n_b: mean, standard deviation and size of group B.
        alpha: significance level; the interval uses the t quantile at
            ``1 - alpha / 2``.
        equal_var: choose Student's (True) or Welch's (False) test.

    Returns:
        A ``TestResult`` with Cohen's d as effect size and the relative lift
        in percent (0.0 when ``mean_a`` is zero).
    """
    # scipy reports the statistic for (first sample - second sample); pass B
    # first so its sign agrees with observed_diff = B - A. The two-sided
    # p-value is unaffected by the ordering.
    t, p_value = stats.ttest_ind_from_stats(
        mean_b, std_b, n_b, mean_a, std_a, n_a, equal_var=equal_var
    )

    diff = mean_b - mean_a
    var_a = std_a ** 2
    var_b = std_b ** 2

    # Cohen's d with the root-mean-square of the two SDs as denominator
    pooled_std = np.sqrt((var_a + var_b) / 2)
    cohens_d = diff / pooled_std if pooled_std > 0 else 0.0

    # CI for the difference, matching the variance assumption of the test
    if equal_var:
        # Student: pooled variance with n_a + n_b - 2 degrees of freedom
        df = n_a + n_b - 2
        pooled_var = ((n_a - 1) * var_a + (n_b - 1) * var_b) / df
        se = np.sqrt(pooled_var * (1 / n_a + 1 / n_b))
    else:
        # Welch: unpooled SE with Welch–Satterthwaite degrees of freedom
        vn_a = var_a / n_a
        vn_b = var_b / n_b
        se = np.sqrt(vn_a + vn_b)
        df_den = vn_a ** 2 / (n_a - 1) + vn_b ** 2 / (n_b - 1)
        df = (vn_a + vn_b) ** 2 / df_den if df_den > 0 else n_a + n_b - 2
    t_crit = stats.t.ppf(1 - alpha / 2, df)
    ci = (diff - t_crit * se, diff + t_crit * se)

    rel_lift = diff / mean_a * 100 if mean_a != 0 else 0.0

    return TestResult(
        test_name="Student's t-test" if equal_var else "Welch's t-test",
        statistic=round(float(t), 4),
        p_value=round(float(p_value), 6),
        ci_lower=round(ci[0], 4),
        ci_upper=round(ci[1], 4),
        observed_diff=round(diff, 4),
        relative_lift=round(rel_lift, 2),
        effect_size=round(cohens_d, 4),
        effect_name="Cohen's d",
        significant=bool(p_value < alpha),
        alpha=alpha,
    )


# ── Power analysis ────────────────────────────────────────────────────────────

def compute_power(
    n_per_group: int,
    baseline_rate: float,
    mde: float,
    alpha: float = 0.05,
    two_tailed: bool = True,
) -> float:
    """
    Statistical power for a two-proportion z-test (normal approximation).

    Power = P(reject H0 | H1 is true).

    The rejection threshold uses the pooled (null) standard error and the
    sampling distribution under H1 uses the unpooled standard error, i.e.
    this is the exact inverse of the closed-form sample-size formula
    ``n = (z_alpha*sqrt(2*p_avg*(1-p_avg)) + z_beta*sqrt(p1*q1 + p2*q2))**2 / mde**2``.
    Only the rejection tail in the direction of the effect is counted.

    Args:
        n_per_group: observations in each group.
        baseline_rate: conversion rate of the control group (p1).
        mde: absolute difference to detect; the treatment rate is
            ``baseline_rate + mde``.
        alpha: significance level.
        two_tailed: use the ``alpha / 2`` critical value when True,
            ``alpha`` otherwise.

    Returns:
        Power as a float in [0, 1]; 0.0 when the pooled rate has zero variance.
    """
    p1 = baseline_rate
    p2 = baseline_rate + mde
    p_avg = (p1 + p2) / 2

    # Standard error of the difference under H0 (both groups at the pooled rate)
    se_null = np.sqrt(2 * p_avg * (1 - p_avg) / n_per_group)
    if se_null == 0:
        return 0.0

    # Standard error of the difference under H1 (each group at its own rate)
    se_alt = np.sqrt((p1 * (1 - p1) + p2 * (1 - p2)) / n_per_group)

    z_alpha = stats.norm.ppf(1 - alpha / (2 if two_tailed else 1))
    delta = abs(p2 - p1)
    # Distance between the true effect and the rejection threshold
    margin = delta - z_alpha * se_null
    if se_alt == 0:
        # No sampling noise under H1: the outcome is deterministic
        return 1.0 if margin > 0 else 0.0
    return float(stats.norm.cdf(margin / se_alt))


def required_sample_size(
    baseline_rate: float,
    mde: float,
    alpha: float = 0.05,
    power: float = 0.80,
    two_tailed: bool = True,
) -> int:
    """
    Per-group sample size needed by a two-proportion z-test.

    Evaluates the closed-form normal-approximation formula directly (no
    iterative search) and rounds the result up to the next integer.

    Args:
        baseline_rate: conversion rate of the control group.
        mde: absolute difference to detect; the treatment rate is
            ``baseline_rate + mde``.
        alpha: significance level.
        power: target probability of detecting the difference.
        two_tailed: use the ``alpha / 2`` critical value when True,
            ``alpha`` otherwise.

    Returns:
        The required number of observations in each group.
    """
    rate_ctrl = baseline_rate
    rate_trt = baseline_rate + mde
    rate_mid = (rate_ctrl + rate_trt) / 2

    tails = 2 if two_tailed else 1
    crit_alpha = stats.norm.ppf(1 - alpha / tails)
    crit_beta = stats.norm.ppf(power)

    # Spread under H0 (both groups at the midpoint rate) ...
    null_term = crit_alpha * np.sqrt(2 * rate_mid * (1 - rate_mid))
    # ... and under H1 (each group at its own rate).
    alt_term = crit_beta * np.sqrt(
        rate_ctrl * (1 - rate_ctrl) + rate_trt * (1 - rate_trt)
    )

    n_exact = (null_term + alt_term) ** 2 / (rate_trt - rate_ctrl) ** 2
    return int(np.ceil(n_exact))


def power_curve(
    baseline_rate: float,
    mde: float,
    alpha: float = 0.05,
    n_max_multiplier: float = 3.0,
) -> Tuple[List[int], List[float]]:
    """
    Tabulate power against per-group sample size, e.g. for plotting.

    The grid starts at 50 and stops before ``n_max``, where ``n_max`` is the
    80%-power sample size times ``n_max_multiplier`` (but at least 500). The
    step is ``n_max // 200`` (minimum 1), giving roughly 200 points.

    Returns:
        ``(sample_sizes, powers)`` as two parallel lists.
    """
    n_at_80 = required_sample_size(baseline_rate, mde, alpha, power=0.80)
    upper = max(int(n_at_80 * n_max_multiplier), 500)
    stride = max(1, upper // 200)

    sample_sizes = list(range(50, upper, stride))
    powers = [
        compute_power(size, baseline_rate, mde, alpha)
        for size in sample_sizes
    ]
    return sample_sizes, powers


# ── Multiple testing correction ───────────────────────────────────────────────

def fdr_correction(
    p_values: List[float],
    alpha: float = 0.05,
) -> Tuple[List[float], List[bool]]:
    """
    Adjust a family of p-values with the Benjamini–Hochberg procedure.

    Args:
        p_values: raw p-values, one per test.
        alpha: threshold applied to the adjusted p-values.

    Returns:
        ``(adjusted_p_values, significance_flags)``, both in the same order as
        the input; a flag is True when the adjusted value is below ``alpha``.
    """
    m = len(p_values)
    ranking = np.argsort(p_values)
    ascending = np.array(p_values)[ranking]

    # Scale the k-th smallest p-value by m / k.
    ranks = np.arange(1, m + 1)
    scaled = ascending * m / ranks

    # Running minimum from the largest rank downwards keeps the adjusted
    # values non-decreasing in rank; then cap at 1.
    running_min = np.minimum.accumulate(scaled[::-1])[::-1]
    capped = np.minimum(running_min, 1.0)

    # Scatter back into the caller's original ordering.
    adjusted = np.empty(m)
    adjusted[ranking] = capped

    flags = [bool(q < alpha) for q in adjusted]
    return list(adjusted), flags


# ── Chi-square test of independence ──────────────────────────────────────────

def chi_square_test(
    n_a: int, conv_a: int,
    n_b: int, conv_b: int,
    alpha: float = 0.05,
) -> TestResult:
    """
    Chi-square test of independence for a 2×2 contingency table.
    Equivalent to the z-test for proportions (z² = χ²) but more familiar
    to some practitioners.

    The table rows are the groups (A, B) and the columns are
    (converted, not converted). No continuity correction is applied.

    Args:
        n_a, conv_a: number of observations and conversions in group A.
        n_b, conv_b: number of observations and conversions in group B.
        alpha: significance level.

    Returns:
        A ``TestResult`` with Cramér's V as effect size. No confidence
        interval is computed, so both bounds are NaN. The relative lift is
        0.0 when group A's rate is not positive.
    """
    table = np.array([
        [conv_a, n_a - conv_a],
        [conv_b, n_b - conv_b],
    ])

    if (table.sum(axis=0) == 0).any():
        # Nobody (or everybody) converted in both groups: the expected table
        # contains zeros and scipy would raise. The rates are identical, so
        # report "no evidence of a difference", as the z-test does here.
        chi2, p_value = 0.0, 1.0
    else:
        chi2, p_value, _, _ = stats.chi2_contingency(table, correction=False)

    p_a = conv_a / n_a
    p_b = conv_b / n_b
    diff = p_b - p_a

    # Cramér's V effect size (for a 2×2 table this is sqrt(χ² / N))
    n_total = n_a + n_b
    cramers_v = np.sqrt(chi2 / n_total)

    return TestResult(
        test_name="Chi-square test",
        statistic=round(chi2, 4),
        p_value=round(p_value, 6),
        ci_lower=float("nan"),
        ci_upper=float("nan"),
        observed_diff=round(diff, 6),
        relative_lift=round(diff / p_a * 100 if p_a > 0 else 0.0, 2),
        effect_size=round(cramers_v, 4),
        effect_name="Cramér's V",
        significant=bool(p_value < alpha),
        alpha=alpha,
    )