ab-testing-causal / src /frequentist.py
fikri0o0's picture
2026-06-04: Initial deployment — A/B Testing & Causal Inference Simulator
4256820
"""
Frequentist A/B testing: z-test, t-test, power analysis, FDR correction.
All functions return typed dataclasses for easy serialisation.
"""
from __future__ import annotations
import numpy as np
from dataclasses import dataclass, asdict
from typing import List, Tuple
from scipy import stats
# ── Result container ──────────────────────────────────────────────────────────
@dataclass
class TestResult:
    """Flat record describing the outcome of one hypothesis test.

    The field order below is also the positional order of the generated
    constructor, so it must stay as it is.
    """

    # Which test produced this record, with its statistic and p-value.
    test_name: str
    statistic: float
    p_value: float

    # Interval (confidence or credible) for the difference between groups.
    ci_lower: float
    ci_upper: float

    # Point estimate of (B - A), and the same difference relative to A in percent.
    observed_diff: float
    relative_lift: float

    # Effect-size value and the name of the measure it is expressed in.
    effect_size: float
    effect_name: str

    # Significance flag and the alpha level stored alongside it.
    significant: bool
    alpha: float

    def to_dict(self) -> dict:
        """Return all fields as a plain dictionary via ``dataclasses.asdict``."""
        as_mapping = asdict(self)
        return as_mapping
# ── Two-proportion Z-test ─────────────────────────────────────────────────────
def two_proportion_ztest(
    n_a: int, conv_a: int,
    n_b: int, conv_b: int,
    alpha: float = 0.05,
    two_tailed: bool = True,
) -> TestResult:
    """
    Z-test for the difference in conversion rates between groups A and B.

    The test statistic uses the pooled standard error under H0; the confidence
    interval uses the unpooled standard error. The interval is always the
    two-sided ``1 - alpha`` interval, also when ``two_tailed`` is False.

    Args:
        n_a: Number of observations in group A (must be positive).
        conv_a: Number of conversions in group A (``0 <= conv_a <= n_a``).
        n_b: Number of observations in group B (must be positive).
        conv_b: Number of conversions in group B (``0 <= conv_b <= n_b``).
        alpha: Significance level, strictly between 0 and 1.
        two_tailed: If True the p-value is two-sided; otherwise it is the
            upper-tail probability, i.e. the alternative is ``p_b > p_a``.

    Returns:
        TestResult holding the z statistic, p-value, CI for ``p_b - p_a``,
        the relative lift in percent (0.0 when ``p_a`` is 0) and Cohen's h.

    Raises:
        ValueError: If a group size is not positive, a conversion count lies
            outside ``[0, n]``, or ``alpha`` is not in ``(0, 1)``.
    """
    # Fail loudly instead of dividing by zero or returning NaN statistics.
    if n_a <= 0 or n_b <= 0:
        raise ValueError("n_a and n_b must be positive")
    if not (0 <= conv_a <= n_a and 0 <= conv_b <= n_b):
        raise ValueError("conversions must satisfy 0 <= conv <= n in each group")
    if not 0 < alpha < 1:
        raise ValueError("alpha must lie strictly between 0 and 1")

    p_a = conv_a / n_a
    p_b = conv_b / n_b
    diff = p_b - p_a
    p_pool = (conv_a + conv_b) / (n_a + n_b)

    # Pooled SE for the test statistic
    se_test = np.sqrt(p_pool * (1 - p_pool) * (1 / n_a + 1 / n_b))
    if se_test == 0:
        # Pooled rate is exactly 0 or 1, so both rates are equal and diff is 0;
        # a tiny SE keeps the division defined and yields z == 0.
        se_test = 1e-12
    z = diff / se_test

    # The survival function keeps precision in the far tail, where
    # 1 - cdf(z) would cancel to exactly 0.
    if two_tailed:
        p_value = 2 * stats.norm.sf(abs(z))
    else:
        p_value = stats.norm.sf(z)

    # Unpooled SE for the CI
    se_ci = np.sqrt(p_a * (1 - p_a) / n_a + p_b * (1 - p_b) / n_b)
    z_crit = stats.norm.ppf(1 - alpha / 2)
    ci = (diff - z_crit * se_ci, diff + z_crit * se_ci)

    # Cohen's h effect size
    cohen_h = 2 * np.arcsin(np.sqrt(p_b)) - 2 * np.arcsin(np.sqrt(p_a))
    rel_lift = diff / p_a * 100 if p_a > 0 else 0.0

    return TestResult(
        test_name="Two-proportion Z-test",
        statistic=round(float(z), 4),
        p_value=round(float(p_value), 6),
        ci_lower=round(float(ci[0]), 6),
        ci_upper=round(float(ci[1]), 6),
        observed_diff=round(diff, 6),
        relative_lift=round(rel_lift, 2),
        effect_size=round(float(cohen_h), 4),
        effect_name="Cohen's h",
        significant=bool(p_value < alpha),
        alpha=alpha,
    )
# ── Two-sample t-test ─────────────────────────────────────────────────────────
def two_sample_ttest(
    mean_a: float, std_a: float, n_a: int,
    mean_b: float, std_b: float, n_b: int,
    alpha: float = 0.05,
    equal_var: bool = False,
) -> TestResult:
    """
    Two-sample t-test for a difference in means from summary statistics
    (e.g., revenue per user).

    ``equal_var=False`` runs Welch's test; ``equal_var=True`` runs Student's
    pooled-variance test. The reported test name, standard error, degrees of
    freedom and confidence interval all follow the chosen variant.

    Args:
        mean_a, std_a, n_a: Sample mean, standard deviation and size of group A.
        mean_b, std_b, n_b: Sample mean, standard deviation and size of group B.
        alpha: Significance level, strictly between 0 and 1.
        equal_var: Whether to assume equal population variances.

    Returns:
        TestResult whose statistic, observed difference, CI and Cohen's d are
        all oriented as (B - A). Relative lift is in percent of ``mean_a``
        (0.0 when ``mean_a`` is 0).

    Raises:
        ValueError: If either group has fewer than two observations or
            ``alpha`` is not in ``(0, 1)``.
    """
    # With n == 1 the degrees of freedom are undefined (division by n - 1).
    if n_a < 2 or n_b < 2:
        raise ValueError("n_a and n_b must each be at least 2")
    if not 0 < alpha < 1:
        raise ValueError("alpha must lie strictly between 0 and 1")

    # scipy reports the statistic for (first mean - second mean); pass B first
    # so the sign of t agrees with observed_diff and Cohen's d (both B - A).
    t_stat, p_value = stats.ttest_ind_from_stats(
        mean_b, std_b, n_b, mean_a, std_a, n_a, equal_var=equal_var
    )

    diff = mean_b - mean_a
    var_a = std_a ** 2
    var_b = std_b ** 2

    if equal_var:
        # Student: pooled variance, n_a + n_b - 2 degrees of freedom
        test_name = "Student's t-test"
        df = n_a + n_b - 2
        pooled_var = ((n_a - 1) * var_a + (n_b - 1) * var_b) / df
        se = np.sqrt(pooled_var * (1 / n_a + 1 / n_b))
    else:
        # Welch: unpooled SE with Welch–Satterthwaite degrees of freedom
        test_name = "Welch's t-test"
        vn_a = var_a / n_a
        vn_b = var_b / n_b
        se = np.sqrt(vn_a + vn_b)
        df_den = vn_a ** 2 / (n_a - 1) + vn_b ** 2 / (n_b - 1)
        # Both variances zero: fall back to the pooled df to avoid 0 / 0
        df = (vn_a + vn_b) ** 2 / df_den if df_den > 0 else n_a + n_b - 2

    t_crit = stats.t.ppf(1 - alpha / 2, df)
    ci = (diff - t_crit * se, diff + t_crit * se)

    # Cohen's d; the denominator is the root of the unweighted mean variance
    avg_std = np.sqrt((var_a + var_b) / 2)
    cohens_d = diff / avg_std if avg_std > 0 else 0.0
    rel_lift = diff / mean_a * 100 if mean_a != 0 else 0.0

    return TestResult(
        test_name=test_name,
        statistic=round(float(t_stat), 4),
        p_value=round(float(p_value), 6),
        ci_lower=round(float(ci[0]), 4),
        ci_upper=round(float(ci[1]), 4),
        observed_diff=round(diff, 4),
        relative_lift=round(rel_lift, 2),
        effect_size=round(float(cohens_d), 4),
        effect_name="Cohen's d",
        significant=bool(p_value < alpha),
        alpha=alpha,
    )
# ── Power analysis ────────────────────────────────────────────────────────────
def compute_power(
    n_per_group: int,
    baseline_rate: float,
    mde: float,
    alpha: float = 0.05,
    two_tailed: bool = True,
) -> float:
    """
    Statistical power for a two-proportion z-test.
    Power = P(reject H0 | H1 is true).

    Uses the same normal approximation as ``required_sample_size`` (pooled
    standard deviation under H0, unpooled under H1), so the power evaluated at
    the required sample size reaches the requested target. For a two-tailed
    test only the tail in the direction of the effect is counted.

    Args:
        n_per_group: Observations in each group (must be positive).
        baseline_rate: Conversion rate of the control group.
        mde: Absolute difference between treatment and control rates.
        alpha: Significance level.
        two_tailed: Whether the critical value is for a two-sided test.

    Returns:
        Power as a float. Returns 0.0 when the average rate is exactly 0 or 1,
        where the null standard deviation vanishes.

    Raises:
        ValueError: If ``n_per_group`` is not positive.
    """
    if n_per_group <= 0:
        raise ValueError("n_per_group must be positive")

    p1 = baseline_rate
    p2 = baseline_rate + mde
    p_avg = (p1 + p2) / 2

    # Per-observation standard deviation of the difference under H0 (pooled)
    sd_null = np.sqrt(2 * p_avg * (1 - p_avg))
    if sd_null == 0:
        return 0.0
    # ... and under H1 (each group keeps its own variance)
    sd_alt = np.sqrt(p1 * (1 - p1) + p2 * (1 - p2))

    z_alpha = stats.norm.ppf(1 - alpha / (2 if two_tailed else 1))
    delta = abs(p2 - p1)

    # Distance between the true effect and the rejection threshold, both
    # scaled by sqrt(n) so that n appears only once.
    margin = delta * np.sqrt(n_per_group) - z_alpha * sd_null
    if sd_alt == 0:
        # Rates of exactly 0 and 1: the outcome is deterministic under H1.
        return 1.0 if margin > 0 else 0.0
    return float(stats.norm.cdf(margin / sd_alt))
def required_sample_size(
    baseline_rate: float,
    mde: float,
    alpha: float = 0.05,
    power: float = 0.80,
    two_tailed: bool = True,
) -> int:
    """
    Minimum sample size per group for a two-proportion z-test.
    Uses the closed-form normal-approximation formula rather than a search.

    Args:
        baseline_rate: Conversion rate of the control group, in [0, 1].
        mde: Absolute difference to detect; non-zero, and
            ``baseline_rate + mde`` must also lie in [0, 1].
        alpha: Significance level, strictly between 0 and 1.
        power: Target power, strictly between 0 and 1.
        two_tailed: Whether the critical value is for a two-sided test.

    Returns:
        Sample size per group, rounded up to the next integer.

    Raises:
        ValueError: If ``mde`` is zero, a rate falls outside [0, 1], or
            ``alpha`` / ``power`` is not in ``(0, 1)``.
    """
    # Each of these would otherwise surface as an opaque int(inf) / int(nan)
    # conversion error at the very end.
    if mde == 0:
        raise ValueError("mde must be non-zero")
    p1 = baseline_rate
    p2 = baseline_rate + mde
    if not (0 <= p1 <= 1 and 0 <= p2 <= 1):
        raise ValueError("baseline_rate and baseline_rate + mde must lie in [0, 1]")
    if not 0 < alpha < 1:
        raise ValueError("alpha must lie strictly between 0 and 1")
    if not 0 < power < 1:
        raise ValueError("power must lie strictly between 0 and 1")

    p_avg = (p1 + p2) / 2
    z_alpha = stats.norm.ppf(1 - alpha / (2 if two_tailed else 1))
    z_beta = stats.norm.ppf(power)

    # Pooled SD under H0 for the alpha term, unpooled SD under H1 for the beta term
    sd_null = np.sqrt(2 * p_avg * (1 - p_avg))
    sd_alt = np.sqrt(p1 * (1 - p1) + p2 * (1 - p2))
    numerator = (z_alpha * sd_null + z_beta * sd_alt) ** 2
    denominator = (p2 - p1) ** 2
    return int(np.ceil(numerator / denominator))
def power_curve(
    baseline_rate: float,
    mde: float,
    alpha: float = 0.05,
    n_max_multiplier: float = 3.0,
) -> Tuple[List[int], List[float]]:
    """
    Tabulate power against per-group sample size, e.g. for plotting.

    The grid starts at 50 and stops before an upper bound equal to the sample
    size required for 80% power times ``n_max_multiplier`` (at least 500),
    using a step of one 200th of that bound (at least 1).

    Returns (sample_sizes, powers) as two parallel lists.
    """
    target_n = required_sample_size(baseline_rate, mde, alpha, power=0.80)

    # Upper end of the grid, never below 500
    upper = int(target_n * n_max_multiplier)
    if upper < 500:
        upper = 500

    # Step size that gives roughly 200 grid points
    stride = upper // 200
    if stride < 1:
        stride = 1

    sample_sizes = list(range(50, upper, stride))
    power_values = [
        compute_power(size, baseline_rate, mde, alpha) for size in sample_sizes
    ]
    return sample_sizes, power_values
# ── Multiple testing correction ───────────────────────────────────────────────
def fdr_correction(
    p_values: List[float],
    alpha: float = 0.05,
) -> Tuple[List[float], List[bool]]:
    """
    Benjamini–Hochberg adjustment of a family of p-values.

    Returns (adjusted_p_values, significance_flags), both in the order of the
    input; a flag is True when the adjusted p-value is strictly below alpha.
    """
    m = len(p_values)
    ascending = np.argsort(p_values)
    ranked_p = np.array(p_values)[ascending]

    # Scale the i-th smallest p-value by m / i
    ranks = np.arange(1, m + 1)
    scaled = ranked_p * m / ranks

    # Running minimum taken from the largest p-value downwards, so adjusted
    # values never decrease with rank; then cap at 1.
    running_min = np.minimum.accumulate(scaled[::-1])
    capped = np.minimum(running_min[::-1], 1.0)

    # Scatter back to the caller's ordering
    adjusted = np.empty(m)
    adjusted[ascending] = capped

    flags = [bool(q < alpha) for q in adjusted]
    return list(adjusted), flags
# ── Chi-square test of independence ──────────────────────────────────────────
def chi_square_test(
    n_a: int, conv_a: int,
    n_b: int, conv_b: int,
    alpha: float = 0.05,
) -> TestResult:
    """
    Chi-square test of independence for a 2×2 contingency table.
    Equivalent to the two-sided z-test for proportions (z² = χ²) but more
    familiar to some practitioners. No Yates continuity correction is applied.

    Args:
        n_a: Number of observations in group A (must be positive).
        conv_a: Number of conversions in group A (``0 <= conv_a <= n_a``).
        n_b: Number of observations in group B (must be positive).
        conv_b: Number of conversions in group B (``0 <= conv_b <= n_b``).
        alpha: Significance level used for the ``significant`` flag.

    Returns:
        TestResult with the chi-square statistic, p-value, observed difference
        (B - A), relative lift in percent (0.0 when ``p_a`` is 0) and Cramér's V.
        No confidence interval is computed, so both CI bounds are NaN.

    Raises:
        ValueError: If a group size is not positive or a conversion count lies
            outside ``[0, n]``.
    """
    if n_a <= 0 or n_b <= 0:
        raise ValueError("n_a and n_b must be positive")
    if not (0 <= conv_a <= n_a and 0 <= conv_b <= n_b):
        raise ValueError("conversions must satisfy 0 <= conv <= n in each group")

    n_total = n_a + n_b
    total_conv = conv_a + conv_b

    if total_conv == 0 or total_conv == n_total:
        # Nobody (or everybody) converted: one column of the table is empty,
        # the expected counts contain zeros and scipy refuses the table.
        # Both rates are identical, so report "no evidence" like the z-test.
        chi2, p_value = 0.0, 1.0
    else:
        table = np.array([
            [conv_a, n_a - conv_a],
            [conv_b, n_b - conv_b],
        ])
        chi2, p_value, _, _ = stats.chi2_contingency(table, correction=False)

    p_a = conv_a / n_a
    p_b = conv_b / n_b
    diff = p_b - p_a

    # Cramér's V effect size (for a 2×2 table this is sqrt(χ² / N))
    cramers_v = np.sqrt(chi2 / n_total)

    return TestResult(
        test_name="Chi-square test",
        statistic=round(float(chi2), 4),
        p_value=round(float(p_value), 6),
        ci_lower=float("nan"),
        ci_upper=float("nan"),
        observed_diff=round(diff, 6),
        relative_lift=round(diff / p_a * 100 if p_a > 0 else 0.0, 2),
        effect_size=round(float(cramers_v), 4),
        effect_name="Cramér's V",
        significant=bool(p_value < alpha),
        alpha=alpha,
    )