| from __future__ import annotations |
|
|
| import typing |
|
|
| import numpy as np |
| from scipy.stats import norm, t |
|
|
| from numerize.numerize import numerize as nmr |
|
|
| if typing.TYPE_CHECKING: |
| from typing import Union |
|
|
# Ordered (label, length in days) buckets used to describe how long a test
# needs to run. Order matters: the first bucket that fits is the one reported.
categories = (
    ("2 weeks", 14),
    ("4 weeks", 28),
    ("6 weeks", 42),
)


def check_days_and_humanise(days, tolerance=0.1):
    """Translate a test duration in days into a human-readable bucket.

    Args:
        days: Number of days the test is expected to run.
        tolerance: Fractional slack added on top of each bucket's length, so
            that with the default of 0.1 a 15-day test still counts as
            "2 weeks" (14 + 10% = 15.4 days).

    Returns:
        The label of the first entry in ``categories`` whose length plus
        slack is at least ``days``, or a recommendation to redefine the test
        when ``days`` exceeds every bucket.
    """
    for label, limit in categories:
        # The slack used to be hard-coded as 0.1, which silently ignored the
        # `tolerance` argument. The expression keeps the same shape so the
        # default produces exactly the same thresholds as before.
        if days <= limit + tolerance * limit:
            return label

    return "We recommend redefining the test, since the current requirement for obtaining statistical significance spans a time frame that is excessively long!"
|
|
|
|
def calculate_sample_size_rev(
    original_mean,
    std: float,
    mde: float,
    significance_level: float = 5,
    power: float = 80,
    sample_per_day=None,
):
    """Estimate the sample size and run time for a metric with a known std.

    ``mde``, ``significance_level`` and ``power`` are given as percentages;
    they are divided by 100 before being handed to ``get_sample_size``.

    Args:
        original_mean: Baseline mean of the metric.
        std: Standard deviation of the metric.
        mde: Minimum detectable effect, as a percentage of ``original_mean``.
        significance_level: Significance level, in percent.
        power: Statistical power, in percent.
        sample_per_day: Expected number of samples collected per day. ``None``
            or ``0`` means "not provided".

    Returns:
        A 3-tuple of display values:

        * on success: ``(formatted sample size, formatted number of days,
          duration verdict)``;
        * when ``sample_per_day`` is missing: ``(formatted sample size,
          explanatory message, "...")``;
        * when the calculation fails: ``(error message, "", "")``.
    """
    try:
        samples = get_sample_size(
            original_mean, std, mde / 100, significance_level / 100, power / 100
        )
    # OverflowError covers an infinite estimate (e.g. a zero MDE), which
    # round() rejects with a different exception type than NaN.
    except (ValueError, OverflowError) as e:
        return (
            f"input caused a calculation error {e}, values provided were original_mean-{original_mean}, std-{std}, mde-{mde}, significance_level-{significance_level}, power-{power}, samples_per_day-{sample_per_day}",
            "",
            "",
        )

    # A rate of 0 cannot be turned into a duration (it would divide by zero),
    # so it is reported the same way as a missing rate.
    if sample_per_day is None or sample_per_day == 0:
        # The placeholder is text, not a number, so it is passed through
        # as-is instead of being handed to the number formatter.
        return (
            nmr(samples),
            "You didn't provide an expected number of sample per day",
            "...",
        )

    # Always report at least one day, even for tiny sample sizes.
    days = max(int(np.round(samples / sample_per_day)), 1)
    return nmr(samples), nmr(days), check_days_and_humanise(days)
|
|
|
|
def calculate_sample_size_cr(
    original_mean,
    mde: float,
    significance_level: float = 5,
    power: float = 80,
    sample_per_day=None,
):
    """Estimate the sample size and run time for a conversion-rate metric.

    All percentage inputs are divided by 100 before use. The standard
    deviations of the baseline and of the uplifted rate are derived from the
    rates themselves as ``sqrt(p * (1 - p))``.

    Args:
        original_mean: Baseline conversion rate, in percent.
        mde: Minimum detectable effect, as a percentage of the baseline rate.
        significance_level: Significance level, in percent.
        power: Statistical power, in percent.
        sample_per_day: Expected number of samples collected per day. ``None``
            or ``0`` means "not provided".

    Returns:
        A 3-tuple of display values:

        * on success: ``(formatted sample size, formatted number of days,
          duration verdict)``;
        * when ``sample_per_day`` is missing: ``(formatted sample size,
          explanatory message, "...")``;
        * when the inputs are infeasible or the calculation fails:
          ``(error message, "", "")``.
    """
    # From here on both values are fractions, and that is also how they are
    # reported in the messages below.
    original_mean = original_mean / 100
    mde = mde / 100
    std = np.sqrt(original_mean * (1 - original_mean))

    # The uplifted rate must still be a valid probability.
    if original_mean * (1 + mde) > 1:
        return (
            f"It is not possible to compute the sample size since you passed"
            f" an unfeasible combination of values of MDE {mde} and Conversion Rate {original_mean}."
            f"This combination of values require to measure the probability of converting more than 100% which doesn't make sense!",
            "",
            "",
        )

    # Standard deviation of the conversion rate after the uplift.
    alternative_std = np.sqrt(original_mean * (1 + mde) * (1 - original_mean * (1 + mde)))
    try:
        samples = get_sample_size(
            original_mean,
            std,
            mde,
            significance_level / 100,
            power / 100,
            alternative_std=alternative_std,
        )
    # OverflowError covers an infinite estimate (e.g. a zero MDE), which
    # round() rejects with a different exception type than NaN.
    except (ValueError, OverflowError) as e:
        return (
            f"input caused a calculation error {e}, values provided were original_mean-{original_mean}, mde-{mde}, significance_level-{significance_level}, power-{power}, samples_per_day-{sample_per_day}",
            "",
            "",
        )

    # A rate of 0 cannot be turned into a duration (it would divide by zero),
    # so it is reported the same way as a missing rate.
    if sample_per_day is None or sample_per_day == 0:
        # The placeholder is text, not a number, so it is passed through
        # as-is instead of being handed to the number formatter.
        return (
            nmr(samples),
            "You didn't provide an expected number of sample per day",
            "...",
        )

    # Always report at least one day, even for tiny sample sizes.
    days = max(int(np.round(samples / sample_per_day)), 1)
    return nmr(samples), nmr(days), check_days_and_humanise(days)
|
|
|
|
def get_sample_size(
    original_mean,
    std: float,
    mde: float,
    alpha: float = 0.05,
    power: float = 0.8,
    alternative_std=None,
):
    """Compute a sample size from normal quantiles.

    Args:
        original_mean: Baseline mean of the metric.
        std: Standard deviation under the baseline.
        mde: Minimum detectable effect as a fraction of ``original_mean``.
        alpha: Significance level as a fraction; the critical value is taken
            at ``1 - alpha`` (it is not halved).
        power: Desired power as a fraction.
        alternative_std: Standard deviation under the alternative. Defaults
            to ``std`` when omitted.

    Returns:
        The estimated sample size as an ``int``, never less than 1.

    Raises:
        ValueError: If the absolute effect ``mde * original_mean`` is zero,
            or if the inputs lead to a NaN estimate (for example an
            ``alpha`` or ``power`` outside the open interval (0, 1)).
    """
    if alternative_std is None:
        alternative_std = std

    mde_abs = mde * original_mean
    # A zero effect would divide by zero and yield an infinite estimate that
    # round() rejects with OverflowError; fail with the exception type the
    # callers in this module already handle instead.
    if mde_abs == 0:
        raise ValueError(
            "the absolute minimum detectable effect (mde * original_mean) must be non-zero"
        )

    # NOTE(review): the 2 * std weighting suggests the result is a total
    # across two equally sized groups - confirm against the intended design.
    factor = (2 * std) * norm.ppf(1 - alpha) + (std + alternative_std) * norm.ppf(power)
    return max(round((factor / mde_abs) ** 2), 1)
|
|
|
|
def get_iterative_sample_size(
    std: float,
    mde_abs: float,
    alpha: float = 0.05,
    power: float = 0.8,
    starting_sample_size=1,
    go_to_convergence=True,
):
    """Refine a t-distribution based sample size estimate by re-feeding it.

    An estimate is computed using ``starting_sample_size`` as the degrees of
    freedom. With ``go_to_convergence`` enabled, that estimate is fed back in
    as the degrees of freedom, and the function recurses until two
    consecutive rounded estimates differ by less than 1.

    Args:
        std: Standard deviation of the metric.
        mde_abs: Minimum detectable effect in absolute units.
        alpha: Significance level as a fraction.
        power: Desired power as a fraction.
        starting_sample_size: Degrees of freedom used for the first estimate.
        go_to_convergence: When false, return the first estimate rounded.

    Returns:
        The rounded sample size estimate (as produced by ``np.round``).
    """
    first_estimate = _approx_sample_size_with_t_dist(
        std, mde_abs, alpha, power, starting_sample_size
    )
    if not go_to_convergence:
        return np.round(first_estimate)

    second_estimate = _approx_sample_size_with_t_dist(
        std, mde_abs, alpha, power, first_estimate
    )

    # Kept as a ">= 1" test on purpose: a NaN difference compares false and
    # therefore ends the recursion instead of looping.
    still_moving = abs(np.round(second_estimate) - np.round(first_estimate)) >= 1
    if still_moving:
        return get_iterative_sample_size(
            std, mde_abs, alpha, power, second_estimate, go_to_convergence
        )
    return np.round(second_estimate)
|
|
|
|
def _approx_sample_size_with_t_dist(
    std: float,
    mde_abs: float,
    alpha: float,
    power: float,
    df: Union[int, float],
    one_tailed=True,
):
    """Return one sample size estimate using Student-t quantiles.

    Args:
        std: Standard deviation of the metric.
        mde_abs: Minimum detectable effect in absolute units.
        alpha: Significance level as a fraction.
        power: Desired power as a fraction.
        df: Degrees of freedom passed to the t-distribution quantiles.
        one_tailed: When false, ``alpha`` is halved before the critical
            value is looked up.

    Returns:
        ``((t_{1-alpha} + t_{power}) * std / mde_abs) ** 2 + 1`` as a float.
    """
    tail_alpha = alpha if one_tailed else alpha / 2

    quantile_sum = t.ppf(1 - tail_alpha, df) + t.ppf(power, df)
    noise_to_effect = std / mde_abs
    return (quantile_sum * noise_to_effect) ** 2 + 1
|
|