frequentist_sample_size / sample_size_calculator.py
lciampiconi-lm's picture
min samples per day -> 1
4746a1c
from __future__ import annotations
import typing
import numpy as np
from scipy.stats import norm, t
from numerize.numerize import numerize as nmr
if typing.TYPE_CHECKING:
from typing import Union
# Supported test durations as (human-readable label, length in days),
# ordered from shortest to longest so the first match is the tightest fit.
categories = (
    ("2 weeks", 14),
    ("4 weeks", 28),
    ("6 weeks", 42),
)


def check_days_and_humanise(days, tolerance=0.1):
    """Map a required number of days onto the shortest fitting test duration.

    Args:
        days: Number of days needed to collect the required sample.
        tolerance: Fractional slack applied to every duration in
            ``categories``; a duration of ``d`` days accepts up to
            ``d + tolerance * d`` days. Defaults to 0.1 (10%).

    Returns:
        The label of the first duration in ``categories`` that can accommodate
        ``days``, or a recommendation message when ``days`` exceeds even the
        longest duration.
    """
    for label, limit in categories:
        # Honour the caller-supplied tolerance instead of a hard-coded 10%.
        if days <= limit + tolerance * limit:
            return label
    return "We recommend redefining the test, since the current requirement for obtaining statistical significance spans a time frame that is excessively long!"
def calculate_sample_size_rev(
    original_mean,
    std: float,
    mde: float,
    significance_level: float = 5,
    power: float = 80,
    sample_per_day=None,
):
    """Compute the sample size and test duration for a continuous metric.

    Args:
        original_mean: Current mean of the metric.
        std: Standard deviation of the metric.
        mde: Minimum detectable effect, in percent of ``original_mean``.
        significance_level: Significance level, in percent.
        power: Statistical power, in percent.
        sample_per_day: Expected number of samples collected per day. ``None``
            or a non-positive value is treated as "not provided".

    Returns:
        A 3-tuple ``(samples, days, outcome)``:
        * on success, the formatted sample size, the formatted number of days
          (at least 1) and the humanised duration verdict;
        * when ``sample_per_day`` is not usable, the formatted sample size, an
          explanatory message and ``"..."``;
        * when the computation fails, an error message followed by two empty
          strings.
    """
    try:
        samples = get_sample_size(
            original_mean, std, mde / 100, significance_level / 100, power / 100
        )
    # Degenerate inputs (e.g. a zero MDE or mean) can surface as an overflow or
    # a division by zero rather than a ValueError; report them the same way.
    except (ValueError, OverflowError, ZeroDivisionError) as e:
        return (
            f"input caused a calculation error {e}, values provided were original_mean-{original_mean}, std-{std}, mde-{mde}, significance_level-{significance_level}, power-{power}, samples_per_day-{sample_per_day}",
            "",
            "",
        )

    # Without a usable daily volume no duration can be derived. The message is
    # returned as-is: `nmr` is a number formatter and must not receive text.
    if sample_per_day is None or sample_per_day <= 0:
        return (
            nmr(samples),
            "You didn't provide an expected number of sample per day",
            "...",
        )

    # A test always runs for at least one full day.
    days = max(int(np.round(samples / sample_per_day)), 1)
    return nmr(samples), nmr(days), check_days_and_humanise(days)
def calculate_sample_size_cr(
    original_mean,
    mde: float,
    significance_level: float = 5,
    power: float = 80,
    sample_per_day=None,
):
    """Compute the sample size and test duration for a conversion rate.

    Args:
        original_mean: Current conversion rate, in percent.
        mde: Minimum detectable effect, in percent of the conversion rate.
        significance_level: Significance level, in percent.
        power: Statistical power, in percent.
        sample_per_day: Expected number of samples collected per day. ``None``
            or a non-positive value is treated as "not provided".

    Returns:
        A 3-tuple ``(samples, days, outcome)``:
        * on success, the formatted sample size, the formatted number of days
          (at least 1) and the humanised duration verdict;
        * when ``sample_per_day`` is not usable, the formatted sample size, an
          explanatory message and ``"..."``;
        * when the inputs are unfeasible or the computation fails, an error
          message followed by two empty strings.
    """
    # Work with fractions from here on.
    original_mean = original_mean / 100
    mde = mde / 100
    # Standard deviation of a Bernoulli variable with the baseline rate.
    std = np.sqrt(original_mean * (1 - original_mean))

    # The uplifted rate must still be a valid probability.
    if original_mean * (1 + mde) > 1:
        return (
            f"It is not possible to compute the sample size since you passed"
            f" an unfeasible combination of values of MDE {mde} and Conversion Rate {original_mean}."
            f"This combination of values require to measure the probability of converting more than 100% which doesn't make sense!",
            "",
            "",
        )

    # Standard deviation of a Bernoulli variable with the uplifted rate.
    alternative_std = np.sqrt(original_mean * (1 + mde) * (1 - original_mean * (1 + mde)))
    try:
        samples = get_sample_size(
            original_mean,
            std,
            mde,
            significance_level / 100,
            power / 100,
            alternative_std=alternative_std,
        )
    # Degenerate inputs (e.g. a zero MDE or rate) can surface as an overflow or
    # a division by zero rather than a ValueError; report them the same way.
    except (ValueError, OverflowError, ZeroDivisionError) as e:
        return (
            f"input caused a calculation error {e}, values provided were original_mean-{original_mean}, mde-{mde}, significance_level-{significance_level}, power-{power}, samples_per_day-{sample_per_day}",
            "",
            "",
        )

    # Without a usable daily volume no duration can be derived. The message is
    # returned as-is: `nmr` is a number formatter and must not receive text.
    if sample_per_day is None or sample_per_day <= 0:
        return (
            nmr(samples),
            "You didn't provide an expected number of sample per day",
            "...",
        )

    # A test always runs for at least one full day.
    days = max(int(np.round(samples / sample_per_day)), 1)
    return nmr(samples), nmr(days), check_days_and_humanise(days)
def get_sample_size(
    original_mean,
    std: float,
    mde: float,
    alpha: float = 0.05,
    power: float = 0.8,
    alternative_std=None,
):
    """Return the required sample size from a normal approximation.

    The absolute effect is ``mde * original_mean`` and the result is
    ``((2 * std * z(1 - alpha) + (std + alternative_std) * z(power)) / effect) ** 2``
    rounded to the nearest integer, where ``z`` is the standard normal
    quantile function. The critical value uses ``1 - alpha`` (one-sided).

    Args:
        original_mean: Baseline mean of the metric.
        std: Standard deviation under the baseline.
        mde: Minimum detectable effect as a fraction of ``original_mean``.
        alpha: Significance level as a fraction.
        power: Statistical power as a fraction.
        alternative_std: Standard deviation under the alternative; defaults
            to ``std`` when omitted.

    Returns:
        The sample size, never smaller than 1.

    Raises:
        ValueError: If the absolute effect is zero or the resulting sample
            size is not a finite number (e.g. ``alpha`` or ``power`` outside
            the open interval (0, 1)).
    """
    if alternative_std is None:
        alternative_std = std

    mde_abs = mde * original_mean
    # Fail early with the exception type callers handle, instead of dividing
    # by zero and hitting an OverflowError when rounding infinity.
    if mde_abs == 0:
        raise ValueError(
            "the absolute minimum detectable effect (mde * original_mean) must be non-zero"
        )

    factor = (2 * std) * norm.ppf(1 - alpha) + (std + alternative_std) * norm.ppf(power)
    required = (factor / mde_abs) ** 2
    # NaN/inf cannot be rounded to an integer; surface them uniformly.
    if not np.isfinite(required):
        raise ValueError(f"the computed sample size is not finite ({required})")
    return max(round(required), 1)
def get_iterative_sample_size(
    std: float,
    mde_abs: float,
    alpha: float = 0.05,
    power: float = 0.8,
    starting_sample_size=1,
    go_to_convergence=True,
):
    """Estimate the sample size with a t-distribution, optionally iterating.

    Each estimate is produced by ``_approx_sample_size_with_t_dist`` using the
    previous estimate as the degrees-of-freedom argument. With
    ``go_to_convergence`` enabled, the function recurses until two consecutive
    estimates round to the same value.

    Args:
        std: Standard deviation of the metric.
        mde_abs: Minimum detectable effect in absolute terms.
        alpha: Significance level as a fraction.
        power: Statistical power as a fraction.
        starting_sample_size: Degrees of freedom used for the first estimate.
        go_to_convergence: When false, return after a single estimate.

    Returns:
        The rounded sample size estimate.
    """
    first_pass = _approx_sample_size_with_t_dist(
        std, mde_abs, alpha, power, starting_sample_size
    )
    if not go_to_convergence:
        return np.round(first_pass)

    # Refine once more, feeding the first estimate back in.
    second_pass = _approx_sample_size_with_t_dist(
        std, mde_abs, alpha, power, first_pass
    )
    needs_another_round = abs(np.round(second_pass) - np.round(first_pass)) >= 1
    if needs_another_round:
        return get_iterative_sample_size(
            std, mde_abs, alpha, power, second_pass, go_to_convergence
        )
    return np.round(second_pass)
def _approx_sample_size_with_t_dist(
std: float,
mde_abs: float,
alpha: float,
power: float,
df: Union[int, float],
one_tailed=True,
):
if not one_tailed:
alpha = alpha / 2
factor = t.ppf(1 - alpha, df) + t.ppf(power, df)
return (factor * (std / mde_abs)) ** 2 + 1