| import numpy as np
|
| import pandas as pd
|
| import matplotlib.pyplot as plt
|
| import seaborn as sns
|
| import pingouin as pg
|
|
|
| from scipy.stats import t, gaussian_kde, bartlett, levene
|
|
|
|
|
|
|
|
|
|
|
| def plot_ttest_mean_distribution(
|
| numeric_col: str,
|
| sample: np.ndarray,
|
| mu0: float,
|
| df_output: pd.DataFrame,
|
| alternative: str,
|
| bootstrap_samples: int,
|
| ):
|
| """
|
| Plot bootstrap sampling distribution of the mean vs. theoretical t under H0,
|
| shading the p-value region.
|
| """
|
| plt.style.use("seaborn-v0_8-whitegrid")
|
|
|
|
|
| p_val = df_output["p-val"].values[0]
|
| df = df_output["dof"].values[0]
|
|
|
|
|
| n = len(sample)
|
| sample_mean = np.mean(sample)
|
| sample_std = np.std(sample, ddof=1)
|
| se = sample_std / np.sqrt(n)
|
|
|
|
|
| x = np.linspace(mu0 - 5 * se, mu0 + 5 * se, 1000)
|
| t_density = t.pdf((x - mu0) / se, df) / se
|
|
|
|
|
| boot_means = np.array(
|
| [
|
| np.mean(np.random.choice(sample, size=n, replace=True))
|
| for _ in range(bootstrap_samples)
|
| ]
|
| )
|
|
|
|
|
| fig_tmp, ax_tmp = plt.subplots()
|
| sns.kdeplot(boot_means, bw_adjust=1.2, ax=ax_tmp)
|
| x_kde, y_kde = ax_tmp.lines[0].get_data()
|
| plt.close(fig_tmp)
|
|
|
|
|
| fig, ax = plt.subplots(figsize=(8, 5))
|
|
|
|
|
| ax.plot(
|
| x_kde,
|
| y_kde,
|
| color="rebeccapurple",
|
| label="Bootstrap sampling dist.",
|
| linewidth=2,
|
| )
|
|
|
|
|
| ax.plot(
|
| x,
|
| t_density,
|
| color="gray",
|
| linestyle="--",
|
| linewidth=2,
|
| label=r"$t$-distribution ($H_0$)",
|
| )
|
|
|
|
|
| if alternative == "two-sided":
|
| delta = abs(sample_mean - mu0)
|
| lower = mu0 - delta
|
| upper = mu0 + delta
|
| mask = (x <= lower) | (x >= upper)
|
| elif alternative == "greater":
|
| mask = x >= sample_mean
|
| elif alternative == "less":
|
| mask = x <= sample_mean
|
| else:
|
| raise ValueError("alternative must be 'two-sided', 'greater', or 'less'")
|
|
|
| ax.fill_between(
|
| x,
|
| 0,
|
| t_density,
|
| where=mask,
|
| color="red",
|
| alpha=0.3,
|
| label=f"p-value ≈ {p_val:.3f}",
|
| )
|
|
|
|
|
| ax.axvline(
|
| mu0,
|
| color="tab:orange",
|
| linestyle="--",
|
| linewidth=2,
|
| label=rf"$\mu_0 = {mu0}$",
|
| )
|
| ax.axvline(
|
| sample_mean,
|
| color="black",
|
| linestyle="-",
|
| linewidth=1.5,
|
| label=rf"Sample mean = {sample_mean:.2f}",
|
| )
|
|
|
|
|
| ax.set_title(
|
| f"Sampling Distribution of the Mean ({numeric_col})",
|
| fontsize=14,
|
| )
|
| ax.set_xlabel("Sample Mean", fontsize=12)
|
| ax.set_ylabel("Density", fontsize=12)
|
| ax.grid(True, linestyle="--", alpha=0.5)
|
| ax.legend()
|
| plt.tight_layout()
|
|
|
| return fig
|
|
|
|
|
| def one_sample_ttest(
|
| sample: np.ndarray,
|
| mu0: float,
|
| alternative: str,
|
| *,
|
| numeric_col: str,
|
| bootstrap_samples: int,
|
| include_graph: bool,
|
| ) -> tuple[pd.DataFrame, plt.Figure | None]:
|
| """
|
| One-sample Student's t-test and optional sampling distribution plot.
|
| """
|
| if sample.size == 0:
|
| raise ValueError("No valid data in the selected column.")
|
|
|
| df_output = pg.ttest(
|
| x=sample,
|
| y=mu0,
|
| alternative=alternative,
|
| paired=False,
|
| )
|
|
|
| fig = None
|
| if include_graph:
|
| fig = plot_ttest_mean_distribution(
|
| numeric_col=numeric_col,
|
| sample=sample,
|
| mu0=mu0,
|
| df_output=df_output,
|
| alternative=alternative,
|
| bootstrap_samples=bootstrap_samples,
|
| )
|
|
|
| return df_output, fig
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| def mirror_plot(
|
| numeric_col: str,
|
| group1: np.ndarray,
|
| name_group1: str,
|
| group2: np.ndarray,
|
| name_group2: str,
|
| df_output: pd.DataFrame,
|
| ):
|
| """
|
| Mirror histogram + KDE plot for two groups.
|
| """
|
| t_val = df_output["T"].values[0]
|
| p_val = df_output["p-val"].values[0]
|
|
|
| mean1 = np.mean(group1)
|
| mean2 = np.mean(group2)
|
|
|
| fig, ax = plt.subplots(figsize=(10, 6))
|
|
|
|
|
| combined = np.concatenate([group1, group2])
|
| x_min, x_max = float(np.min(combined)), float(np.max(combined))
|
| bin_range = np.linspace(x_min, x_max, 30)
|
| bin_centers = (bin_range[:-1] + bin_range[1:]) / 2
|
| bin_width = np.diff(bin_range)[0]
|
| x_vals = np.linspace(x_min, x_max, 200)
|
|
|
|
|
| sns.histplot(
|
| group1,
|
| bins=bin_range,
|
| stat="density",
|
| kde=False,
|
| color="rebeccapurple",
|
| label=name_group1,
|
| alpha=0.6,
|
| ax=ax,
|
| )
|
| kde1 = gaussian_kde(group1)
|
| ax.plot(x_vals, kde1(x_vals), color="rebeccapurple", linewidth=2)
|
| ax.axvline(
|
| mean1,
|
| color="rebeccapurple",
|
| linestyle="--",
|
| linewidth=2,
|
| label=f"{name_group1} mean = {mean1:.2f}",
|
| )
|
|
|
|
|
| heights2, _ = np.histogram(group2, bins=bin_range, density=True)
|
| ax.bar(
|
| bin_centers,
|
| -heights2,
|
| width=bin_width,
|
| color="tab:orange",
|
| edgecolor="black",
|
| alpha=0.6,
|
| label=name_group2,
|
| )
|
| kde2 = gaussian_kde(group2)
|
| ax.plot(x_vals, -kde2(x_vals), color="tab:orange", linewidth=2)
|
| ax.axvline(
|
| mean2,
|
| color="tab:orange",
|
| linestyle="--",
|
| linewidth=2,
|
| label=f"{name_group2} mean = {mean2:.2f}",
|
| )
|
|
|
| ax.axhline(0, color="black", linewidth=1)
|
|
|
| ax.set_title("Mirror Plot: Two-Sample Distribution Comparison", fontsize=14)
|
| ax.set_xlabel(numeric_col)
|
| ax.set_ylabel("Density (Top ↑ vs. Bottom ↓)", fontsize=11)
|
|
|
| ax.text(
|
| 0.01,
|
| 0.95,
|
| f"p = {p_val:.3f}",
|
| transform=ax.transAxes,
|
| fontsize=11,
|
| verticalalignment="top",
|
| bbox=dict(boxstyle="round", facecolor="white", alpha=0.6),
|
| )
|
|
|
| ax.legend()
|
| plt.tight_layout()
|
| return fig
|
|
|
|
|
| def plot_mean_distribution(
|
| group1: np.ndarray,
|
| name_group1: str,
|
| group2: np.ndarray,
|
| name_group2: str,
|
| bootstrap_samples: int,
|
| df_output: pd.DataFrame,
|
| ):
|
| """
|
| Bootstrap distributions of the two sample means.
|
| """
|
| p_val = df_output["p-val"].values[0]
|
|
|
| mean1 = np.mean(group1)
|
| mean2 = np.mean(group2)
|
|
|
| boot1 = [
|
| np.mean(np.random.choice(group1, size=len(group1), replace=True))
|
| for _ in range(bootstrap_samples)
|
| ]
|
| boot2 = [
|
| np.mean(np.random.choice(group2, size=len(group2), replace=True))
|
| for _ in range(bootstrap_samples)
|
| ]
|
|
|
| fig, ax = plt.subplots(figsize=(8, 5))
|
| sns.kdeplot(
|
| boot1,
|
| label=f"{name_group1} mean",
|
| fill=True,
|
| color="rebeccapurple",
|
| alpha=0.6,
|
| ax=ax,
|
| )
|
| sns.kdeplot(
|
| boot2,
|
| label=f"{name_group2} mean",
|
| fill=True,
|
| color="tab:orange",
|
| alpha=0.6,
|
| ax=ax,
|
| )
|
|
|
| ax.axvline(mean1, color="rebeccapurple", linestyle="--", linewidth=2)
|
| ax.axvline(mean2, color="tab:orange", linestyle="--", linewidth=2)
|
|
|
| ax.set_title("Bootstrap Mean Distributions", fontsize=14)
|
| ax.set_xlabel("Mean", fontsize=12)
|
| ax.set_ylabel("Density", fontsize=12)
|
| ax.grid(True, linestyle="--", alpha=0.5)
|
|
|
| ax.text(
|
| 0.98,
|
| 0.95,
|
| f"p = {round(p_val, 3)}\n"
|
| f"Mean({name_group1}) = {round(mean1, 2)}\n"
|
| f"Mean({name_group2}) = {round(mean2, 2)}",
|
| transform=ax.transAxes,
|
| ha="right",
|
| va="top",
|
| bbox=dict(boxstyle="round", facecolor="white", alpha=0.7),
|
| fontsize=11,
|
| )
|
|
|
| ax.legend()
|
| plt.tight_layout()
|
|
|
| return fig
|
|
|
|
|
| def two_sample_ttest(
|
| group1: np.ndarray,
|
| group2: np.ndarray,
|
| *,
|
| numeric_col: str,
|
| name_group1: str,
|
| name_group2: str,
|
| alternative: str,
|
| correction: bool,
|
| plot_type: str,
|
| bootstrap_samples: int,
|
| include_graph: bool,
|
| ) -> tuple[pd.DataFrame, plt.Figure | None]:
|
| """
|
| Two-sample Student's t-test (pingouin.ttest) plus optional plots.
|
| """
|
| if group1.size == 0 or group2.size == 0:
|
| raise ValueError("One or both groups are empty after filtering.")
|
|
|
| df_output = pg.ttest(
|
| x=group1,
|
| y=group2,
|
| alternative=alternative,
|
| paired=False,
|
| correction=correction,
|
| )
|
|
|
| fig = None
|
| if include_graph:
|
| if plot_type == "Sample Histogram":
|
| fig = mirror_plot(
|
| numeric_col=numeric_col,
|
| group1=group1,
|
| name_group1=name_group1,
|
| group2=group2,
|
| name_group2=name_group2,
|
| df_output=df_output,
|
| )
|
| elif plot_type == "Mean Density":
|
| fig = plot_mean_distribution(
|
| group1=group1,
|
| name_group1=name_group1,
|
| group2=group2,
|
| name_group2=name_group2,
|
| bootstrap_samples=bootstrap_samples,
|
| df_output=df_output,
|
| )
|
|
|
| return df_output, fig
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| def plot_variance_distribution(
|
| p: float,
|
| group1: np.ndarray,
|
| name_group1: str,
|
| var1: float,
|
| group2: np.ndarray,
|
| name_group2: str,
|
| var2: float,
|
| method: str,
|
| bootstrap_samples: int,
|
| ):
|
| """
|
| Bootstrap distributions of sample variances for two groups.
|
| """
|
| boot1 = [
|
| np.var(np.random.choice(group1, size=len(group1), replace=True), ddof=1)
|
| for _ in range(bootstrap_samples)
|
| ]
|
| boot2 = [
|
| np.var(np.random.choice(group2, size=len(group2), replace=True), ddof=1)
|
| for _ in range(bootstrap_samples)
|
| ]
|
|
|
| fig, ax = plt.subplots(figsize=(8, 5))
|
| sns.kdeplot(
|
| boot1,
|
| label=f"{name_group1} variance",
|
| fill=True,
|
| color="rebeccapurple",
|
| alpha=0.6,
|
| ax=ax,
|
| )
|
| sns.kdeplot(
|
| boot2,
|
| label=f"{name_group2} variance",
|
| fill=True,
|
| color="tab:orange",
|
| alpha=0.6,
|
| ax=ax,
|
| )
|
|
|
| ax.axvline(var1, color="rebeccapurple", linestyle="--", linewidth=2)
|
| ax.axvline(var2, color="tab:orange", linestyle="--", linewidth=2)
|
|
|
| ax.set_title(f"Bootstrap Variance Distributions\n{method}", fontsize=14)
|
| ax.set_xlabel("Variance", fontsize=12)
|
| ax.set_ylabel("Density", fontsize=12)
|
| ax.grid(True, linestyle="--", alpha=0.5)
|
|
|
| ax.text(
|
| 0.98,
|
| 0.95,
|
| f"{method}\n"
|
| f"p = {round(p, 3)}\n"
|
| f"Var({name_group1}) = {round(var1, 2)}\n"
|
| f"Var({name_group2}) = {round(var2, 2)}",
|
| transform=ax.transAxes,
|
| ha="right",
|
| va="top",
|
| bbox=dict(boxstyle="round", facecolor="white", alpha=0.7),
|
| fontsize=11,
|
| )
|
|
|
| ax.legend()
|
| plt.tight_layout()
|
|
|
| return fig
|
|
|
|
|
| def variance_test(
|
| group1: np.ndarray,
|
| group2: np.ndarray,
|
| *,
|
| name_group1: str,
|
| name_group2: str,
|
| test_type: str,
|
| include_graph: bool,
|
| bootstrap_samples: int,
|
| ) -> tuple[pd.DataFrame, plt.Figure | None]:
|
| """
|
| Bartlett or Levene test + optional bootstrap variance plots.
|
| """
|
| if group1.size == 0 or group2.size == 0:
|
| raise ValueError("One or both groups are empty after filtering.")
|
|
|
| if test_type == "Bartlett":
|
| stat, p = bartlett(group1, group2)
|
| method = "Bartlett's test"
|
| elif test_type == "Levene":
|
| stat, p = levene(group1, group2, center="mean")
|
| method = "Levene's test"
|
| else:
|
| raise ValueError("Invalid test type selected.")
|
|
|
| var1 = float(np.var(group1, ddof=1))
|
| var2 = float(np.var(group2, ddof=1))
|
|
|
| df_output = pd.DataFrame(
|
| {
|
| "Test": [method],
|
| "Statistic": [stat],
|
| "p-value": [p],
|
| f"Var({name_group1})": [var1],
|
| f"Var({name_group2})": [var2],
|
| }
|
| )
|
|
|
| fig = None
|
| if include_graph:
|
| fig = plot_variance_distribution(
|
| p=p,
|
| group1=group1,
|
| name_group1=name_group1,
|
| var1=var1,
|
| group2=group2,
|
| name_group2=name_group2,
|
| var2=var2,
|
| method=method,
|
| bootstrap_samples=bootstrap_samples,
|
| )
|
|
|
| return df_output, fig
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| def one_way_anova_plot(
|
| data_group: pd.DataFrame,
|
| numeric_col: str,
|
| cat_col: str,
|
| df_output: pd.DataFrame,
|
| ):
|
| """
|
| KDE plot of group distributions with F/p annotation.
|
| """
|
| p_val = df_output["p-unc"].values[0]
|
|
|
| groups = sorted(data_group[cat_col].dropna().unique())
|
| palette = sns.color_palette("tab10", n_colors=len(groups))
|
| group_color_map = dict(zip(groups, palette))
|
|
|
| fig, ax = plt.subplots(figsize=(8, 5))
|
|
|
| for group in groups:
|
| subset = data_group[data_group[cat_col] == group][numeric_col].dropna()
|
| sns.kdeplot(
|
| subset,
|
| fill=True,
|
| common_norm=False,
|
| color=group_color_map[group],
|
| alpha=0.5,
|
| linewidth=1,
|
| label=str(group),
|
| ax=ax,
|
| )
|
|
|
| overall_mean = data_group[numeric_col].mean()
|
| ax.axvline(
|
| overall_mean,
|
| color="black",
|
| linestyle=":",
|
| linewidth=1.2,
|
| label="Overall mean",
|
| )
|
|
|
| group_means = data_group.groupby(cat_col)[numeric_col].mean()
|
| for group, mean_val in group_means.items():
|
| ax.axvline(
|
| mean_val,
|
| color=group_color_map[group],
|
| linestyle="--",
|
| linewidth=1.5,
|
| label=f"{group} mean",
|
| )
|
|
|
| ax.text(
|
| 0.98,
|
| 0.95,
|
| f"p = {p_val:.3f}",
|
| transform=ax.transAxes,
|
| ha="right",
|
| va="top",
|
| bbox=dict(boxstyle="round", facecolor="white", alpha=0.7),
|
| fontsize=11,
|
| )
|
|
|
| ax.set_title("Group Distributions for One-way ANOVA", fontsize=14)
|
| ax.set_xlabel(numeric_col, fontsize=12)
|
| ax.set_ylabel("Density", fontsize=12)
|
| ax.grid(True, linestyle="--", alpha=0.3)
|
| ax.legend(title=cat_col)
|
| plt.tight_layout()
|
|
|
| return fig
|
|
|
|
|
| def one_way_anova(
|
| data_group: pd.DataFrame,
|
| *,
|
| numeric_col: str,
|
| cat_col: str,
|
| ) -> tuple[pd.DataFrame, plt.Figure]:
|
| """
|
| One-way ANOVA (pingouin.anova) + distribution plot.
|
| """
|
| if data_group.empty:
|
| raise ValueError("Dataset is empty after filtering.")
|
|
|
| df_output = pg.anova(
|
| dv=numeric_col,
|
| between=cat_col,
|
| data=data_group,
|
| detailed=True,
|
| )
|
|
|
| fig = one_way_anova_plot(
|
| data_group=data_group,
|
| numeric_col=numeric_col,
|
| cat_col=cat_col,
|
| df_output=df_output,
|
| )
|
|
|
| return df_output, fig
|
|
|