import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import pingouin as pg from scipy.stats import t, gaussian_kde, bartlett, levene # ============================================================ # One-sample t-test # ============================================================ def plot_ttest_mean_distribution( numeric_col: str, sample: np.ndarray, mu0: float, df_output: pd.DataFrame, alternative: str, bootstrap_samples: int, ): """ Plot bootstrap sampling distribution of the mean vs. theoretical t under H0, shading the p-value region. """ plt.style.use("seaborn-v0_8-whitegrid") # Pingouin output p_val = df_output["p-val"].values[0] df = df_output["dof"].values[0] # Sample stats n = len(sample) sample_mean = np.mean(sample) sample_std = np.std(sample, ddof=1) se = sample_std / np.sqrt(n) # Theoretical t-distribution under H0 x = np.linspace(mu0 - 5 * se, mu0 + 5 * se, 1000) t_density = t.pdf((x - mu0) / se, df) / se # Bootstrap sampling distribution of the mean boot_means = np.array( [ np.mean(np.random.choice(sample, size=n, replace=True)) for _ in range(bootstrap_samples) ] ) # KDE for bootstrap means (off-screen fig to get line data) fig_tmp, ax_tmp = plt.subplots() sns.kdeplot(boot_means, bw_adjust=1.2, ax=ax_tmp) x_kde, y_kde = ax_tmp.lines[0].get_data() plt.close(fig_tmp) # Final figure fig, ax = plt.subplots(figsize=(8, 5)) # Bootstrap KDE ax.plot( x_kde, y_kde, color="rebeccapurple", label="Bootstrap sampling dist.", linewidth=2, ) # Theoretical t under H0 ax.plot( x, t_density, color="gray", linestyle="--", linewidth=2, label=r"$t$-distribution ($H_0$)", ) # Shade p-value region if alternative == "two-sided": delta = abs(sample_mean - mu0) lower = mu0 - delta upper = mu0 + delta mask = (x <= lower) | (x >= upper) elif alternative == "greater": mask = x >= sample_mean elif alternative == "less": mask = x <= sample_mean else: raise ValueError("alternative must be 'two-sided', 'greater', or 'less'") ax.fill_between( x, 0, t_density, where=mask, color="red", alpha=0.3, label=f"p-value ≈ {p_val:.3f}", ) # Reference lines ax.axvline( mu0, color="tab:orange", linestyle="--", linewidth=2, label=rf"$\mu_0 = {mu0}$", ) ax.axvline( sample_mean, color="black", linestyle="-", linewidth=1.5, label=rf"Sample mean = {sample_mean:.2f}", ) # Formatting ax.set_title( f"Sampling Distribution of the Mean ({numeric_col})", fontsize=14, ) ax.set_xlabel("Sample Mean", fontsize=12) ax.set_ylabel("Density", fontsize=12) ax.grid(True, linestyle="--", alpha=0.5) ax.legend() plt.tight_layout() return fig def one_sample_ttest( sample: np.ndarray, mu0: float, alternative: str, *, numeric_col: str, bootstrap_samples: int, include_graph: bool, ) -> tuple[pd.DataFrame, plt.Figure | None]: """ One-sample Student's t-test and optional sampling distribution plot. """ if sample.size == 0: raise ValueError("No valid data in the selected column.") df_output = pg.ttest( x=sample, y=mu0, alternative=alternative, paired=False, ) fig = None if include_graph: fig = plot_ttest_mean_distribution( numeric_col=numeric_col, sample=sample, mu0=mu0, df_output=df_output, alternative=alternative, bootstrap_samples=bootstrap_samples, ) return df_output, fig # ============================================================ # Two-sample t-test (means) # ============================================================ def mirror_plot( numeric_col: str, group1: np.ndarray, name_group1: str, group2: np.ndarray, name_group2: str, df_output: pd.DataFrame, ): """ Mirror histogram + KDE plot for two groups. """ t_val = df_output["T"].values[0] p_val = df_output["p-val"].values[0] mean1 = np.mean(group1) mean2 = np.mean(group2) fig, ax = plt.subplots(figsize=(10, 6)) # Shared binning combined = np.concatenate([group1, group2]) x_min, x_max = float(np.min(combined)), float(np.max(combined)) bin_range = np.linspace(x_min, x_max, 30) bin_centers = (bin_range[:-1] + bin_range[1:]) / 2 bin_width = np.diff(bin_range)[0] x_vals = np.linspace(x_min, x_max, 200) # Group 1 (top) sns.histplot( group1, bins=bin_range, stat="density", kde=False, color="rebeccapurple", label=name_group1, alpha=0.6, ax=ax, ) kde1 = gaussian_kde(group1) ax.plot(x_vals, kde1(x_vals), color="rebeccapurple", linewidth=2) ax.axvline( mean1, color="rebeccapurple", linestyle="--", linewidth=2, label=f"{name_group1} mean = {mean1:.2f}", ) # Group 2 (bottom, mirrored) heights2, _ = np.histogram(group2, bins=bin_range, density=True) ax.bar( bin_centers, -heights2, width=bin_width, color="tab:orange", edgecolor="black", alpha=0.6, label=name_group2, ) kde2 = gaussian_kde(group2) ax.plot(x_vals, -kde2(x_vals), color="tab:orange", linewidth=2) ax.axvline( mean2, color="tab:orange", linestyle="--", linewidth=2, label=f"{name_group2} mean = {mean2:.2f}", ) ax.axhline(0, color="black", linewidth=1) ax.set_title("Mirror Plot: Two-Sample Distribution Comparison", fontsize=14) ax.set_xlabel(numeric_col) ax.set_ylabel("Density (Top ↑ vs. Bottom ↓)", fontsize=11) ax.text( 0.01, 0.95, f"p = {p_val:.3f}", transform=ax.transAxes, fontsize=11, verticalalignment="top", bbox=dict(boxstyle="round", facecolor="white", alpha=0.6), ) ax.legend() plt.tight_layout() return fig def plot_mean_distribution( group1: np.ndarray, name_group1: str, group2: np.ndarray, name_group2: str, bootstrap_samples: int, df_output: pd.DataFrame, ): """ Bootstrap distributions of the two sample means. """ p_val = df_output["p-val"].values[0] mean1 = np.mean(group1) mean2 = np.mean(group2) boot1 = [ np.mean(np.random.choice(group1, size=len(group1), replace=True)) for _ in range(bootstrap_samples) ] boot2 = [ np.mean(np.random.choice(group2, size=len(group2), replace=True)) for _ in range(bootstrap_samples) ] fig, ax = plt.subplots(figsize=(8, 5)) sns.kdeplot( boot1, label=f"{name_group1} mean", fill=True, color="rebeccapurple", alpha=0.6, ax=ax, ) sns.kdeplot( boot2, label=f"{name_group2} mean", fill=True, color="tab:orange", alpha=0.6, ax=ax, ) ax.axvline(mean1, color="rebeccapurple", linestyle="--", linewidth=2) ax.axvline(mean2, color="tab:orange", linestyle="--", linewidth=2) ax.set_title("Bootstrap Mean Distributions", fontsize=14) ax.set_xlabel("Mean", fontsize=12) ax.set_ylabel("Density", fontsize=12) ax.grid(True, linestyle="--", alpha=0.5) ax.text( 0.98, 0.95, f"p = {round(p_val, 3)}\n" f"Mean({name_group1}) = {round(mean1, 2)}\n" f"Mean({name_group2}) = {round(mean2, 2)}", transform=ax.transAxes, ha="right", va="top", bbox=dict(boxstyle="round", facecolor="white", alpha=0.7), fontsize=11, ) ax.legend() plt.tight_layout() return fig def two_sample_ttest( group1: np.ndarray, group2: np.ndarray, *, numeric_col: str, name_group1: str, name_group2: str, alternative: str, correction: bool, plot_type: str, bootstrap_samples: int, include_graph: bool, ) -> tuple[pd.DataFrame, plt.Figure | None]: """ Two-sample Student's t-test (pingouin.ttest) plus optional plots. """ if group1.size == 0 or group2.size == 0: raise ValueError("One or both groups are empty after filtering.") df_output = pg.ttest( x=group1, y=group2, alternative=alternative, paired=False, correction=correction, ) fig = None if include_graph: if plot_type == "Sample Histogram": fig = mirror_plot( numeric_col=numeric_col, group1=group1, name_group1=name_group1, group2=group2, name_group2=name_group2, df_output=df_output, ) elif plot_type == "Mean Density": fig = plot_mean_distribution( group1=group1, name_group1=name_group1, group2=group2, name_group2=name_group2, bootstrap_samples=bootstrap_samples, df_output=df_output, ) return df_output, fig # ============================================================ # Variance tests (Bartlett / Levene) # ============================================================ def plot_variance_distribution( p: float, group1: np.ndarray, name_group1: str, var1: float, group2: np.ndarray, name_group2: str, var2: float, method: str, bootstrap_samples: int, ): """ Bootstrap distributions of sample variances for two groups. """ boot1 = [ np.var(np.random.choice(group1, size=len(group1), replace=True), ddof=1) for _ in range(bootstrap_samples) ] boot2 = [ np.var(np.random.choice(group2, size=len(group2), replace=True), ddof=1) for _ in range(bootstrap_samples) ] fig, ax = plt.subplots(figsize=(8, 5)) sns.kdeplot( boot1, label=f"{name_group1} variance", fill=True, color="rebeccapurple", alpha=0.6, ax=ax, ) sns.kdeplot( boot2, label=f"{name_group2} variance", fill=True, color="tab:orange", alpha=0.6, ax=ax, ) ax.axvline(var1, color="rebeccapurple", linestyle="--", linewidth=2) ax.axvline(var2, color="tab:orange", linestyle="--", linewidth=2) ax.set_title(f"Bootstrap Variance Distributions\n{method}", fontsize=14) ax.set_xlabel("Variance", fontsize=12) ax.set_ylabel("Density", fontsize=12) ax.grid(True, linestyle="--", alpha=0.5) ax.text( 0.98, 0.95, f"{method}\n" f"p = {round(p, 3)}\n" f"Var({name_group1}) = {round(var1, 2)}\n" f"Var({name_group2}) = {round(var2, 2)}", transform=ax.transAxes, ha="right", va="top", bbox=dict(boxstyle="round", facecolor="white", alpha=0.7), fontsize=11, ) ax.legend() plt.tight_layout() return fig def variance_test( group1: np.ndarray, group2: np.ndarray, *, name_group1: str, name_group2: str, test_type: str, include_graph: bool, bootstrap_samples: int, ) -> tuple[pd.DataFrame, plt.Figure | None]: """ Bartlett or Levene test + optional bootstrap variance plots. """ if group1.size == 0 or group2.size == 0: raise ValueError("One or both groups are empty after filtering.") if test_type == "Bartlett": stat, p = bartlett(group1, group2) method = "Bartlett's test" elif test_type == "Levene": stat, p = levene(group1, group2, center="mean") method = "Levene's test" else: raise ValueError("Invalid test type selected.") var1 = float(np.var(group1, ddof=1)) var2 = float(np.var(group2, ddof=1)) df_output = pd.DataFrame( { "Test": [method], "Statistic": [stat], "p-value": [p], f"Var({name_group1})": [var1], f"Var({name_group2})": [var2], } ) fig = None if include_graph: fig = plot_variance_distribution( p=p, group1=group1, name_group1=name_group1, var1=var1, group2=group2, name_group2=name_group2, var2=var2, method=method, bootstrap_samples=bootstrap_samples, ) return df_output, fig # ============================================================ # One-way ANOVA # ============================================================ def one_way_anova_plot( data_group: pd.DataFrame, numeric_col: str, cat_col: str, df_output: pd.DataFrame, ): """ KDE plot of group distributions with F/p annotation. """ p_val = df_output["p-unc"].values[0] groups = sorted(data_group[cat_col].dropna().unique()) palette = sns.color_palette("tab10", n_colors=len(groups)) group_color_map = dict(zip(groups, palette)) fig, ax = plt.subplots(figsize=(8, 5)) for group in groups: subset = data_group[data_group[cat_col] == group][numeric_col].dropna() sns.kdeplot( subset, fill=True, common_norm=False, color=group_color_map[group], alpha=0.5, linewidth=1, label=str(group), ax=ax, ) overall_mean = data_group[numeric_col].mean() ax.axvline( overall_mean, color="black", linestyle=":", linewidth=1.2, label="Overall mean", ) group_means = data_group.groupby(cat_col)[numeric_col].mean() for group, mean_val in group_means.items(): ax.axvline( mean_val, color=group_color_map[group], linestyle="--", linewidth=1.5, label=f"{group} mean", ) ax.text( 0.98, 0.95, f"p = {p_val:.3f}", transform=ax.transAxes, ha="right", va="top", bbox=dict(boxstyle="round", facecolor="white", alpha=0.7), fontsize=11, ) ax.set_title("Group Distributions for One-way ANOVA", fontsize=14) ax.set_xlabel(numeric_col, fontsize=12) ax.set_ylabel("Density", fontsize=12) ax.grid(True, linestyle="--", alpha=0.3) ax.legend(title=cat_col) plt.tight_layout() return fig def one_way_anova( data_group: pd.DataFrame, *, numeric_col: str, cat_col: str, ) -> tuple[pd.DataFrame, plt.Figure]: """ One-way ANOVA (pingouin.anova) + distribution plot. """ if data_group.empty: raise ValueError("Dataset is empty after filtering.") df_output = pg.anova( dv=numeric_col, between=cat_col, data=data_group, detailed=True, ) fig = one_way_anova_plot( data_group=data_group, numeric_col=numeric_col, cat_col=cat_col, df_output=df_output, ) return df_output, fig