# again / core / hypothesis_tests.py
# Beam2513's picture
# Upload 127 files
# 798602c verified
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pingouin as pg
from scipy.stats import t, gaussian_kde, bartlett, levene
# ============================================================
# One-sample t-test
# ============================================================
def plot_ttest_mean_distribution(
    numeric_col: str,
    sample: np.ndarray,
    mu0: float,
    df_output: pd.DataFrame,
    alternative: str,
    bootstrap_samples: int,
):
    """
    Plot bootstrap sampling distribution of the mean vs. theoretical t under H0,
    shading the p-value region.

    Parameters
    ----------
    numeric_col
        Column name shown in the plot title.
    sample
        Observations being tested.
    mu0
        Hypothesised mean under H0; the t curve is centred on it.
    df_output
        Test result table; the first row of its "p-val" and "dof" columns
        is read.
    alternative
        "two-sided", "greater" or "less"; selects which tail(s) are shaded.
    bootstrap_samples
        Number of bootstrap resamples of the mean.

    Returns
    -------
    The finished matplotlib figure.

    Raises
    ------
    ValueError
        If ``alternative`` is not recognised, ``bootstrap_samples`` is below
        2, the sample has fewer than two observations or no finite non-zero
        spread, or no density could be estimated from the bootstrap means.
    """
    # Validate everything up front so that no figure is created (and left
    # open in pyplot's registry) when the inputs cannot be plotted.
    if alternative not in ("two-sided", "greater", "less"):
        raise ValueError("alternative must be 'two-sided', 'greater', or 'less'")
    if bootstrap_samples < 2:
        raise ValueError(
            "bootstrap_samples must be at least 2 to estimate a density."
        )

    # Pingouin output
    p_val = df_output["p-val"].values[0]
    df = df_output["dof"].values[0]

    # Sample stats
    n = len(sample)
    if n < 2:
        raise ValueError(
            "At least two observations are required to plot the sampling "
            "distribution."
        )
    sample_mean = np.mean(sample)
    sample_std = np.std(sample, ddof=1)
    if not np.isfinite(sample_std) or sample_std == 0:
        # A zero or undefined standard error would make the t curve NaN.
        raise ValueError(
            "The sample must have a finite, non-zero standard deviation."
        )
    se = sample_std / np.sqrt(n)

    # Theoretical t-distribution under H0, rescaled to the units of the mean
    x = np.linspace(mu0 - 5 * se, mu0 + 5 * se, 1000)
    t_density = t.pdf((x - mu0) / se, df) / se

    # Region of the H0 curve that is shaded as the p-value
    if alternative == "two-sided":
        delta = abs(sample_mean - mu0)
        mask = (x <= mu0 - delta) | (x >= mu0 + delta)
    elif alternative == "greater":
        mask = x >= sample_mean
    else:  # "less" (validated above)
        mask = x <= sample_mean

    # Bootstrap sampling distribution of the mean
    boot_means = np.array(
        [
            np.mean(np.random.choice(sample, size=n, replace=True))
            for _ in range(bootstrap_samples)
        ]
    )

    # Apply the style only while building this figure instead of changing
    # the global matplotlib style for every later plot in the process.
    with plt.style.context("seaborn-v0_8-whitegrid"):
        # KDE for bootstrap means (off-screen fig to get line data)
        fig_tmp, ax_tmp = plt.subplots()
        try:
            sns.kdeplot(boot_means, bw_adjust=1.2, ax=ax_tmp)
            if not ax_tmp.lines:
                # kdeplot may draw nothing (e.g. zero-variance input).
                raise ValueError(
                    "Could not estimate a density from the bootstrap means; "
                    "try increasing bootstrap_samples."
                )
            x_kde, y_kde = ax_tmp.lines[0].get_data()
        finally:
            # Always release the scratch figure, even if seaborn raised.
            plt.close(fig_tmp)

        # Final figure
        fig, ax = plt.subplots(figsize=(8, 5))
        try:
            # Bootstrap KDE
            ax.plot(
                x_kde,
                y_kde,
                color="rebeccapurple",
                label="Bootstrap sampling dist.",
                linewidth=2,
            )
            # Theoretical t under H0
            ax.plot(
                x,
                t_density,
                color="gray",
                linestyle="--",
                linewidth=2,
                label=r"$t$-distribution ($H_0$)",
            )
            # Shade p-value region
            ax.fill_between(
                x,
                0,
                t_density,
                where=mask,
                color="red",
                alpha=0.3,
                label=f"p-value ≈ {p_val:.3f}",
            )
            # Reference lines
            ax.axvline(
                mu0,
                color="tab:orange",
                linestyle="--",
                linewidth=2,
                label=rf"$\mu_0 = {mu0}$",
            )
            ax.axvline(
                sample_mean,
                color="black",
                linestyle="-",
                linewidth=1.5,
                label=rf"Sample mean = {sample_mean:.2f}",
            )
            # Formatting
            ax.set_title(
                f"Sampling Distribution of the Mean ({numeric_col})",
                fontsize=14,
            )
            ax.set_xlabel("Sample Mean", fontsize=12)
            ax.set_ylabel("Density", fontsize=12)
            ax.grid(True, linestyle="--", alpha=0.5)
            ax.legend()
            fig.tight_layout()
        except Exception:
            # Do not leave a half-built figure registered with pyplot.
            plt.close(fig)
            raise
    return fig
def one_sample_ttest(
    sample: np.ndarray,
    mu0: float,
    alternative: str,
    *,
    numeric_col: str,
    bootstrap_samples: int,
    include_graph: bool,
) -> tuple[pd.DataFrame, plt.Figure | None]:
    """
    Run a one-sample t-test of ``sample`` against ``mu0`` and optionally plot it.

    The test is delegated to ``pingouin.ttest`` (unpaired, ``y=mu0``). When
    ``include_graph`` is true, the sampling-distribution plot is built from
    the same inputs and the test result.

    Returns
    -------
    The pingouin result table and the figure, or ``None`` in place of the
    figure when no graph was requested.

    Raises
    ------
    ValueError
        If ``sample`` contains no elements.
    """
    if not sample.size:
        raise ValueError("No valid data in the selected column.")

    result = pg.ttest(x=sample, y=mu0, alternative=alternative, paired=False)

    # Guard clause: nothing more to do when the caller did not ask for a plot.
    if not include_graph:
        return result, None

    figure = plot_ttest_mean_distribution(
        numeric_col=numeric_col,
        sample=sample,
        mu0=mu0,
        df_output=result,
        alternative=alternative,
        bootstrap_samples=bootstrap_samples,
    )
    return result, figure
# ============================================================
# Two-sample t-test (means)
# ============================================================
def mirror_plot(
    numeric_col: str,
    group1: np.ndarray,
    name_group1: str,
    group2: np.ndarray,
    name_group2: str,
    df_output: pd.DataFrame,
):
    """
    Mirror histogram + KDE plot for two groups.

    Group 1 is drawn above the x-axis; group 2 is drawn below it by negating
    its histogram heights and KDE values. Both groups share the same bins.

    Parameters
    ----------
    numeric_col
        Used as the x-axis label.
    group1, group2
        Observations of each group.
    name_group1, name_group2
        Legend labels for the groups.
    df_output
        Test result table; the first row of its "p-val" column is shown in
        the annotation box.

    Returns
    -------
    The finished matplotlib figure.

    Raises
    ------
    ValueError
        If either group has fewer than two distinct values, since neither a
        shared bin range nor a Gaussian KDE can be built from such data.
    """
    # Reject data the KDE cannot handle before any figure is created.
    for label, values in ((name_group1, group1), (name_group2, group2)):
        if np.size(values) < 2 or np.min(values) == np.max(values):
            raise ValueError(
                f"Group '{label}' needs at least two distinct values to "
                "draw a density estimate."
            )

    p_val = df_output["p-val"].values[0]
    mean1 = np.mean(group1)
    mean2 = np.mean(group2)

    # Shared binning
    combined = np.concatenate([group1, group2])
    x_min, x_max = float(np.min(combined)), float(np.max(combined))
    bin_range = np.linspace(x_min, x_max, 30)
    bin_centers = (bin_range[:-1] + bin_range[1:]) / 2
    bin_width = np.diff(bin_range)[0]
    x_vals = np.linspace(x_min, x_max, 200)

    # Do all numeric work first so a failure here cannot leak a figure.
    density1 = gaussian_kde(group1)(x_vals)
    density2 = gaussian_kde(group2)(x_vals)
    heights2, _ = np.histogram(group2, bins=bin_range, density=True)

    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        # Group 1 (top)
        sns.histplot(
            group1,
            bins=bin_range,
            stat="density",
            kde=False,
            color="rebeccapurple",
            label=name_group1,
            alpha=0.6,
            ax=ax,
        )
        ax.plot(x_vals, density1, color="rebeccapurple", linewidth=2)
        ax.axvline(
            mean1,
            color="rebeccapurple",
            linestyle="--",
            linewidth=2,
            label=f"{name_group1} mean = {mean1:.2f}",
        )
        # Group 2 (bottom, mirrored by negating heights)
        ax.bar(
            bin_centers,
            -heights2,
            width=bin_width,
            color="tab:orange",
            edgecolor="black",
            alpha=0.6,
            label=name_group2,
        )
        ax.plot(x_vals, -density2, color="tab:orange", linewidth=2)
        ax.axvline(
            mean2,
            color="tab:orange",
            linestyle="--",
            linewidth=2,
            label=f"{name_group2} mean = {mean2:.2f}",
        )
        ax.axhline(0, color="black", linewidth=1)
        ax.set_title("Mirror Plot: Two-Sample Distribution Comparison", fontsize=14)
        ax.set_xlabel(numeric_col)
        ax.set_ylabel("Density (Top ↑ vs. Bottom ↓)", fontsize=11)
        ax.text(
            0.01,
            0.95,
            f"p = {p_val:.3f}",
            transform=ax.transAxes,
            fontsize=11,
            verticalalignment="top",
            bbox=dict(boxstyle="round", facecolor="white", alpha=0.6),
        )
        ax.legend()
        fig.tight_layout()
    except Exception:
        # Do not leave a half-built figure registered with pyplot.
        plt.close(fig)
        raise
    return fig
def plot_mean_distribution(
    group1: np.ndarray,
    name_group1: str,
    group2: np.ndarray,
    name_group2: str,
    bootstrap_samples: int,
    df_output: pd.DataFrame,
):
    """
    Draw the bootstrap densities of both group means on a single axis.

    Each group is resampled with replacement ``bootstrap_samples`` times at
    its own size; the resampled means are shown as filled KDE curves, with a
    dashed vertical line at each observed mean. The first "p-val" entry of
    ``df_output`` and both observed means are printed in a text box.

    Returns the matplotlib figure.
    """
    p_val = df_output["p-val"].values[0]
    mean1 = np.mean(group1)
    mean2 = np.mean(group2)

    def _resampled_means(values):
        # One mean per resample, drawn with replacement at the group's size.
        n_obs = len(values)
        means = []
        for _ in range(bootstrap_samples):
            draw = np.random.choice(values, size=n_obs, replace=True)
            means.append(np.mean(draw))
        return means

    # Group 1 is resampled before group 2 (tuple items evaluate in order).
    series = (
        (_resampled_means(group1), name_group1, mean1, "rebeccapurple"),
        (_resampled_means(group2), name_group2, mean2, "tab:orange"),
    )

    fig, ax = plt.subplots(figsize=(8, 5))

    # Filled density curves first ...
    for means, name, _, colour in series:
        sns.kdeplot(
            means,
            label=f"{name} mean",
            fill=True,
            color=colour,
            alpha=0.6,
            ax=ax,
        )
    # ... then the observed means on top.
    for _, _, observed, colour in series:
        ax.axvline(observed, color=colour, linestyle="--", linewidth=2)

    summary = "\n".join(
        (
            f"p = {round(p_val, 3)}",
            f"Mean({name_group1}) = {round(mean1, 2)}",
            f"Mean({name_group2}) = {round(mean2, 2)}",
        )
    )

    ax.set_title("Bootstrap Mean Distributions", fontsize=14)
    ax.set_xlabel("Mean", fontsize=12)
    ax.set_ylabel("Density", fontsize=12)
    ax.grid(True, linestyle="--", alpha=0.5)
    ax.text(
        0.98,
        0.95,
        summary,
        transform=ax.transAxes,
        ha="right",
        va="top",
        bbox=dict(boxstyle="round", facecolor="white", alpha=0.7),
        fontsize=11,
    )
    ax.legend()
    plt.tight_layout()
    return fig
def two_sample_ttest(
    group1: np.ndarray,
    group2: np.ndarray,
    *,
    numeric_col: str,
    name_group1: str,
    name_group2: str,
    alternative: str,
    correction: bool,
    plot_type: str,
    bootstrap_samples: int,
    include_graph: bool,
) -> tuple[pd.DataFrame, plt.Figure | None]:
    """
    Independent two-sample t-test (pingouin.ttest, unpaired) plus optional plots.

    Parameters
    ----------
    group1, group2
        Observations of each group.
    numeric_col
        Axis label for the "Sample Histogram" plot.
    name_group1, name_group2
        Display names of the groups.
    alternative
        Forwarded to ``pingouin.ttest``.
    correction
        Forwarded to ``pingouin.ttest`` (its unequal-variance correction
        switch; see the pingouin documentation).
    plot_type
        "Sample Histogram" (mirror plot) or "Mean Density" (bootstrap means).
        Only inspected when ``include_graph`` is true.
    bootstrap_samples
        Number of resamples for the "Mean Density" plot.
    include_graph
        Whether to build a figure at all.

    Returns
    -------
    The pingouin result table and the figure, or ``None`` in place of the
    figure when no graph was requested.

    Raises
    ------
    ValueError
        If either group is empty, or a graph is requested with an
        unrecognised ``plot_type``.
    """
    if group1.size == 0 or group2.size == 0:
        raise ValueError("One or both groups are empty after filtering.")

    # Fail fast on an unknown plot type instead of silently returning no
    # figure after the caller explicitly asked for one.
    if include_graph and plot_type not in ("Sample Histogram", "Mean Density"):
        raise ValueError(
            "plot_type must be 'Sample Histogram' or 'Mean Density', "
            f"got {plot_type!r}."
        )

    df_output = pg.ttest(
        x=group1,
        y=group2,
        alternative=alternative,
        paired=False,
        correction=correction,
    )

    fig = None
    if include_graph:
        if plot_type == "Sample Histogram":
            fig = mirror_plot(
                numeric_col=numeric_col,
                group1=group1,
                name_group1=name_group1,
                group2=group2,
                name_group2=name_group2,
                df_output=df_output,
            )
        else:  # "Mean Density" (validated above)
            fig = plot_mean_distribution(
                group1=group1,
                name_group1=name_group1,
                group2=group2,
                name_group2=name_group2,
                bootstrap_samples=bootstrap_samples,
                df_output=df_output,
            )
    return df_output, fig
# ============================================================
# Variance tests (Bartlett / Levene)
# ============================================================
def plot_variance_distribution(
    p: float,
    group1: np.ndarray,
    name_group1: str,
    var1: float,
    group2: np.ndarray,
    name_group2: str,
    var2: float,
    method: str,
    bootstrap_samples: int,
):
    """
    Draw the bootstrap densities of both groups' sample variances.

    Each group is resampled with replacement ``bootstrap_samples`` times at
    its own size and the ``ddof=1`` variance of every resample is collected.
    The two collections are shown as filled KDE curves, with dashed vertical
    lines at the supplied ``var1`` and ``var2``. ``method`` appears in the
    title and, together with ``p`` and both variances, in a text box.

    Returns the matplotlib figure.
    """

    def _resampled_variances(values):
        # One unbiased variance per resample, drawn at the group's own size.
        n_obs = len(values)
        variances = []
        for _ in range(bootstrap_samples):
            draw = np.random.choice(values, size=n_obs, replace=True)
            variances.append(np.var(draw, ddof=1))
        return variances

    # Group 1 is resampled before group 2 (tuple items evaluate in order).
    series = (
        (_resampled_variances(group1), name_group1, var1, "rebeccapurple"),
        (_resampled_variances(group2), name_group2, var2, "tab:orange"),
    )

    fig, ax = plt.subplots(figsize=(8, 5))

    # Filled density curves first ...
    for variances, name, _, colour in series:
        sns.kdeplot(
            variances,
            label=f"{name} variance",
            fill=True,
            color=colour,
            alpha=0.6,
            ax=ax,
        )
    # ... then the supplied variances on top.
    for _, _, observed, colour in series:
        ax.axvline(observed, color=colour, linestyle="--", linewidth=2)

    summary = "\n".join(
        (
            f"{method}",
            f"p = {round(p, 3)}",
            f"Var({name_group1}) = {round(var1, 2)}",
            f"Var({name_group2}) = {round(var2, 2)}",
        )
    )

    ax.set_title(f"Bootstrap Variance Distributions\n{method}", fontsize=14)
    ax.set_xlabel("Variance", fontsize=12)
    ax.set_ylabel("Density", fontsize=12)
    ax.grid(True, linestyle="--", alpha=0.5)
    ax.text(
        0.98,
        0.95,
        summary,
        transform=ax.transAxes,
        ha="right",
        va="top",
        bbox=dict(boxstyle="round", facecolor="white", alpha=0.7),
        fontsize=11,
    )
    ax.legend()
    plt.tight_layout()
    return fig
def variance_test(
group1: np.ndarray,
group2: np.ndarray,
*,
name_group1: str,
name_group2: str,
test_type: str,
include_graph: bool,
bootstrap_samples: int,
) -> tuple[pd.DataFrame, plt.Figure | None]:
"""
Bartlett or Levene test + optional bootstrap variance plots.
"""
if group1.size == 0 or group2.size == 0:
raise ValueError("One or both groups are empty after filtering.")
if test_type == "Bartlett":
stat, p = bartlett(group1, group2)
method = "Bartlett's test"
elif test_type == "Levene":
stat, p = levene(group1, group2, center="mean")
method = "Levene's test"
else:
raise ValueError("Invalid test type selected.")
var1 = float(np.var(group1, ddof=1))
var2 = float(np.var(group2, ddof=1))
df_output = pd.DataFrame(
{
"Test": [method],
"Statistic": [stat],
"p-value": [p],
f"Var({name_group1})": [var1],
f"Var({name_group2})": [var2],
}
)
fig = None
if include_graph:
fig = plot_variance_distribution(
p=p,
group1=group1,
name_group1=name_group1,
var1=var1,
group2=group2,
name_group2=name_group2,
var2=var2,
method=method,
bootstrap_samples=bootstrap_samples,
)
return df_output, fig
# ============================================================
# One-way ANOVA
# ============================================================
def one_way_anova_plot(
    data_group: pd.DataFrame,
    numeric_col: str,
    cat_col: str,
    df_output: pd.DataFrame,
):
    """
    KDE plot of group distributions with a p-value annotation.

    One filled KDE curve and one dashed mean line are drawn per distinct
    non-null value of ``cat_col``, plus a dotted line at the overall mean of
    ``numeric_col``.

    Parameters
    ----------
    data_group
        Data containing ``numeric_col`` and ``cat_col``.
    numeric_col
        Column whose distribution is plotted.
    cat_col
        Column defining the groups.
    df_output
        ANOVA result table; the first row of its "p-unc" column is shown in
        the annotation box.

    Returns
    -------
    The finished matplotlib figure.
    """
    p_val = df_output["p-unc"].values[0]
    groups = sorted(data_group[cat_col].dropna().unique())
    palette = sns.color_palette("tab10", n_colors=len(groups))
    group_color_map = dict(zip(groups, palette))

    # Filter once per group and reuse the subset for both the curve and the
    # mean line. Taking the means from a separate groupby could yield keys
    # that are not in ``group_color_map`` (e.g. unobserved categories of a
    # categorical column) and fail with a KeyError.
    values_by_group = {
        group: data_group.loc[data_group[cat_col] == group, numeric_col].dropna()
        for group in groups
    }

    fig, ax = plt.subplots(figsize=(8, 5))
    try:
        for group, subset in values_by_group.items():
            sns.kdeplot(
                subset,
                fill=True,
                common_norm=False,
                color=group_color_map[group],
                alpha=0.5,
                linewidth=1,
                label=str(group),
                ax=ax,
            )
        overall_mean = data_group[numeric_col].mean()
        ax.axvline(
            overall_mean,
            color="black",
            linestyle=":",
            linewidth=1.2,
            label="Overall mean",
        )
        for group, subset in values_by_group.items():
            ax.axvline(
                subset.mean(),
                color=group_color_map[group],
                linestyle="--",
                linewidth=1.5,
                label=f"{group} mean",
            )
        ax.text(
            0.98,
            0.95,
            f"p = {p_val:.3f}",
            transform=ax.transAxes,
            ha="right",
            va="top",
            bbox=dict(boxstyle="round", facecolor="white", alpha=0.7),
            fontsize=11,
        )
        ax.set_title("Group Distributions for One-way ANOVA", fontsize=14)
        ax.set_xlabel(numeric_col, fontsize=12)
        ax.set_ylabel("Density", fontsize=12)
        ax.grid(True, linestyle="--", alpha=0.3)
        ax.legend(title=cat_col)
        fig.tight_layout()
    except Exception:
        # Do not leave a half-built figure registered with pyplot.
        plt.close(fig)
        raise
    return fig
def one_way_anova(
    data_group: pd.DataFrame,
    *,
    numeric_col: str,
    cat_col: str,
) -> tuple[pd.DataFrame, plt.Figure]:
    """
    Run a one-way ANOVA with ``pingouin.anova`` and plot the group distributions.

    ``numeric_col`` is the dependent variable and ``cat_col`` the
    between-subject factor; the detailed ANOVA table is requested.

    Returns
    -------
    The ANOVA table and the distribution figure built from it.

    Raises
    ------
    ValueError
        If ``data_group`` is empty.
    """
    if data_group.empty:
        raise ValueError("Dataset is empty after filtering.")

    anova_table = pg.anova(
        data=data_group,
        dv=numeric_col,
        between=cat_col,
        detailed=True,
    )
    figure = one_way_anova_plot(
        data_group=data_group,
        numeric_col=numeric_col,
        cat_col=cat_col,
        df_output=anova_table,
    )
    return anova_table, figure