Spaces:

Beam2513
/

again

Sleeping

App Files Files Community

again / core /hypothesis_tests.py

Beam2513

Upload 127 files

798602c verified about 1 month ago

raw

history blame contribute delete

16 kB

	import numpy as np
	import pandas as pd
	import matplotlib.pyplot as plt
	import seaborn as sns
	import pingouin as pg

	from scipy.stats import t, gaussian_kde, bartlett, levene

	# ============================================================
	# One-sample t-test
	# ============================================================

	def plot_ttest_mean_distribution(
	    numeric_col: str,
	    sample: np.ndarray,
	    mu0: float,
	    df_output: pd.DataFrame,
	    alternative: str,
	    bootstrap_samples: int,
	):
	    """
	    Plot bootstrap sampling distribution of the mean vs. theoretical t under H0,
	    shading the p-value region.

	    Parameters
	    ----------
	    numeric_col : str
	        Name shown in the plot title.
	    sample : np.ndarray
	        Observed values; resampled with replacement for the bootstrap curve.
	    mu0 : float
	        Hypothesised mean; the t-density is centred here.
	    df_output : pd.DataFrame
	        Pingouin output; the first row's "p-val" and "dof" entries are read.
	    alternative : str
	        "two-sided", "greater" or "less"; selects the shaded region.
	    bootstrap_samples : int
	        Number of bootstrap resamples of the mean.

	    Returns
	    -------
	    The matplotlib figure holding the final plot.

	    Raises
	    ------
	    ValueError
	        If ``alternative`` is not one of the three accepted values, or if the
	        sample has fewer than two observations or no variance (the standard
	        error would be zero/NaN and neither curve could be drawn).
	    """
	    # Validate before touching matplotlib so that a bad argument never
	    # leaves half-built figures open.
	    if alternative not in ("two-sided", "greater", "less"):
	        raise ValueError("alternative must be 'two-sided', 'greater', or 'less'")

	    degenerate_msg = (
	        "Cannot plot the sampling distribution: the sample needs at least "
	        "two observations and non-zero variance."
	    )
	    n = len(sample)
	    if n < 2:
	        raise ValueError(degenerate_msg)

	    # Sample stats
	    sample_mean = np.mean(sample)
	    sample_std = np.std(sample, ddof=1)
	    se = sample_std / np.sqrt(n)
	    if not np.isfinite(se) or se <= 0:
	        raise ValueError(degenerate_msg)

	    # NOTE: plt.style.use changes the active style, so figures created
	    # after this call are affected as well.
	    plt.style.use("seaborn-v0_8-whitegrid")

	    # Pingouin output
	    p_val = df_output["p-val"].values[0]
	    dof = df_output["dof"].values[0]

	    # Theoretical t-distribution under H0, rescaled from t-units to the
	    # scale of the mean (hence the division by se).
	    x = np.linspace(mu0 - 5 * se, mu0 + 5 * se, 1000)
	    t_density = t.pdf((x - mu0) / se, dof) / se

	    # Bootstrap sampling distribution of the mean
	    boot_means = np.array(
	        [
	            np.mean(np.random.choice(sample, size=n, replace=True))
	            for _ in range(bootstrap_samples)
	        ]
	    )

	    # KDE for bootstrap means (off-screen fig to get line data). The
	    # temporary figure is always closed, even if seaborn fails.
	    fig_tmp, ax_tmp = plt.subplots()
	    try:
	        sns.kdeplot(boot_means, bw_adjust=1.2, ax=ax_tmp)
	        # seaborn draws no line when it cannot estimate a density
	        # (e.g. a single bootstrap replicate); skip the curve then.
	        kde_xy = ax_tmp.lines[0].get_data() if ax_tmp.lines else None
	    finally:
	        plt.close(fig_tmp)

	    # Final figure
	    fig, ax = plt.subplots(figsize=(8, 5))

	    # Bootstrap KDE
	    if kde_xy is not None:
	        x_kde, y_kde = kde_xy
	        ax.plot(
	            x_kde,
	            y_kde,
	            color="rebeccapurple",
	            label="Bootstrap sampling dist.",
	            linewidth=2,
	        )

	    # Theoretical t under H0
	    ax.plot(
	        x,
	        t_density,
	        color="gray",
	        linestyle="--",
	        linewidth=2,
	        label=r"$t$-distribution ($H_0$)",
	    )

	    # Shade p-value region
	    if alternative == "two-sided":
	        delta = abs(sample_mean - mu0)
	        lower = mu0 - delta
	        upper = mu0 + delta
	        mask = (x <= lower) | (x >= upper)
	    elif alternative == "greater":
	        mask = x >= sample_mean
	    else:  # "less" (validated above)
	        mask = x <= sample_mean

	    ax.fill_between(
	        x,
	        0,
	        t_density,
	        where=mask,
	        color="red",
	        alpha=0.3,
	        label=f"p-value ≈ {p_val:.3f}",
	    )

	    # Reference lines
	    ax.axvline(
	        mu0,
	        color="tab:orange",
	        linestyle="--",
	        linewidth=2,
	        label=rf"$\mu_0 = {mu0}$",
	    )
	    ax.axvline(
	        sample_mean,
	        color="black",
	        linestyle="-",
	        linewidth=1.5,
	        label=rf"Sample mean = {sample_mean:.2f}",
	    )

	    # Formatting
	    ax.set_title(
	        f"Sampling Distribution of the Mean ({numeric_col})",
	        fontsize=14,
	    )
	    ax.set_xlabel("Sample Mean", fontsize=12)
	    ax.set_ylabel("Density", fontsize=12)
	    ax.grid(True, linestyle="--", alpha=0.5)
	    ax.legend()
	    plt.tight_layout()

	    return fig


	def one_sample_ttest(
	    sample: np.ndarray,
	    mu0: float,
	    alternative: str,
	    *,
	    numeric_col: str,
	    bootstrap_samples: int,
	    include_graph: bool,
	) -> tuple[pd.DataFrame, plt.Figure | None]:
	    """
	    One-sample Student's t-test and optional sampling distribution plot.

	    Parameters
	    ----------
	    sample : np.ndarray
	        Observed values to test.
	    mu0 : float
	        Hypothesised mean, passed to ``pg.ttest`` as ``y``.
	    alternative : str
	        Alternative hypothesis, forwarded to ``pg.ttest`` and to the plot.
	    numeric_col : str
	        Column name used in the plot title.
	    bootstrap_samples : int
	        Number of bootstrap resamples used by the plot.
	    include_graph : bool
	        When true, also build the sampling-distribution figure.

	    Returns
	    -------
	    ``(df_output, fig)`` where ``df_output`` is the table returned by
	    ``pg.ttest`` and ``fig`` is the figure, or ``None`` when
	    ``include_graph`` is false.

	    Raises
	    ------
	    ValueError
	        If ``sample`` is empty.
	    """
	    if sample.size == 0:
	        raise ValueError("No valid data in the selected column.")

	    df_output = pg.ttest(
	        x=sample,
	        y=mu0,
	        alternative=alternative,
	        paired=False,
	    )

	    fig = None
	    if include_graph:
	        fig = plot_ttest_mean_distribution(
	            numeric_col=numeric_col,
	            sample=sample,
	            mu0=mu0,
	            df_output=df_output,
	            alternative=alternative,
	            bootstrap_samples=bootstrap_samples,
	        )

	    return df_output, fig


	# ============================================================
	# Two-sample t-test (means)
	# ============================================================


	def mirror_plot(
	    numeric_col: str,
	    group1: np.ndarray,
	    name_group1: str,
	    group2: np.ndarray,
	    name_group2: str,
	    df_output: pd.DataFrame,
	):
	    """
	    Mirror histogram + KDE plot for two groups.

	    Group 1 is drawn above the x-axis and group 2 below it (negated
	    densities), using the same bin edges for both so the bars line up.

	    Parameters
	    ----------
	    numeric_col : str
	        Label for the x-axis.
	    group1, group2 : np.ndarray
	        Observed values of the two groups.
	    name_group1, name_group2 : str
	        Legend names of the two groups.
	    df_output : pd.DataFrame
	        Test result table; the first row's "p-val" entry is annotated.

	    Returns
	    -------
	    The matplotlib figure.
	    """
	    p_val = df_output["p-val"].values[0]

	    mean1 = np.mean(group1)
	    mean2 = np.mean(group2)

	    fig, ax = plt.subplots(figsize=(10, 6))

	    # Shared binning
	    combined = np.concatenate([group1, group2])
	    x_min, x_max = float(np.min(combined)), float(np.max(combined))
	    bin_range = np.linspace(x_min, x_max, 30)
	    bin_centers = (bin_range[:-1] + bin_range[1:]) / 2
	    bin_width = np.diff(bin_range)[0]
	    x_vals = np.linspace(x_min, x_max, 200)

	    def _kde_is_estimable(values) -> bool:
	        # gaussian_kde needs more than one point and a non-singular
	        # covariance; a constant group would make it raise and abort
	        # the whole plot, so the curve is skipped for such a group.
	        return np.size(values) > 1 and np.ptp(values) > 0

	    # Group 1 (top)
	    sns.histplot(
	        group1,
	        bins=bin_range,
	        stat="density",
	        kde=False,
	        color="rebeccapurple",
	        label=name_group1,
	        alpha=0.6,
	        ax=ax,
	    )
	    if _kde_is_estimable(group1):
	        kde1 = gaussian_kde(group1)
	        ax.plot(x_vals, kde1(x_vals), color="rebeccapurple", linewidth=2)
	    ax.axvline(
	        mean1,
	        color="rebeccapurple",
	        linestyle="--",
	        linewidth=2,
	        label=f"{name_group1} mean = {mean1:.2f}",
	    )

	    # Group 2 (bottom, mirrored)
	    heights2, _ = np.histogram(group2, bins=bin_range, density=True)
	    ax.bar(
	        bin_centers,
	        -heights2,
	        width=bin_width,
	        color="tab:orange",
	        edgecolor="black",
	        alpha=0.6,
	        label=name_group2,
	    )
	    if _kde_is_estimable(group2):
	        kde2 = gaussian_kde(group2)
	        ax.plot(x_vals, -kde2(x_vals), color="tab:orange", linewidth=2)
	    ax.axvline(
	        mean2,
	        color="tab:orange",
	        linestyle="--",
	        linewidth=2,
	        label=f"{name_group2} mean = {mean2:.2f}",
	    )

	    # Baseline separating the upper and the mirrored lower half
	    ax.axhline(0, color="black", linewidth=1)

	    ax.set_title("Mirror Plot: Two-Sample Distribution Comparison", fontsize=14)
	    ax.set_xlabel(numeric_col)
	    ax.set_ylabel("Density (Top ↑ vs. Bottom ↓)", fontsize=11)

	    ax.text(
	        0.01,
	        0.95,
	        f"p = {p_val:.3f}",
	        transform=ax.transAxes,
	        fontsize=11,
	        verticalalignment="top",
	        bbox=dict(boxstyle="round", facecolor="white", alpha=0.6),
	    )

	    ax.legend()
	    plt.tight_layout()
	    return fig


	def plot_mean_distribution(
	    group1: np.ndarray,
	    name_group1: str,
	    group2: np.ndarray,
	    name_group2: str,
	    bootstrap_samples: int,
	    df_output: pd.DataFrame,
	):
	    """
	    Overlay the bootstrap distributions of the two group means.

	    Each group is resampled with replacement ``bootstrap_samples`` times;
	    the resampled means are shown as filled KDE curves, with a dashed
	    vertical line at each observed mean and a text box giving the p-value
	    (first row of ``df_output["p-val"]``) and both means.

	    Returns the matplotlib figure.
	    """
	    p_val = df_output["p-val"].values[0]
	    observed_means = (np.mean(group1), np.mean(group2))
	    mean1, mean2 = observed_means

	    def _resampled_means(values):
	        # One bootstrap replicate per iteration: resample, then average.
	        replicates = []
	        for _ in range(bootstrap_samples):
	            draw = np.random.choice(values, size=len(values), replace=True)
	            replicates.append(np.mean(draw))
	        return replicates

	    # Group 1 is resampled completely before group 2.
	    boot1 = _resampled_means(group1)
	    boot2 = _resampled_means(group2)

	    fig, ax = plt.subplots(figsize=(8, 5))

	    series = (
	        (boot1, name_group1, "rebeccapurple"),
	        (boot2, name_group2, "tab:orange"),
	    )
	    for replicates, group_name, colour in series:
	        sns.kdeplot(
	            replicates,
	            label=f"{group_name} mean",
	            fill=True,
	            color=colour,
	            alpha=0.6,
	            ax=ax,
	        )

	    # Observed means as dashed reference lines, drawn after both KDEs.
	    for observed, (_, _, colour) in zip(observed_means, series):
	        ax.axvline(observed, color=colour, linestyle="--", linewidth=2)

	    ax.set_title("Bootstrap Mean Distributions", fontsize=14)
	    ax.set_xlabel("Mean", fontsize=12)
	    ax.set_ylabel("Density", fontsize=12)
	    ax.grid(True, linestyle="--", alpha=0.5)

	    summary = "\n".join(
	        [
	            f"p = {round(p_val, 3)}",
	            f"Mean({name_group1}) = {round(mean1, 2)}",
	            f"Mean({name_group2}) = {round(mean2, 2)}",
	        ]
	    )
	    ax.text(
	        0.98,
	        0.95,
	        summary,
	        transform=ax.transAxes,
	        ha="right",
	        va="top",
	        bbox=dict(boxstyle="round", facecolor="white", alpha=0.7),
	        fontsize=11,
	    )

	    ax.legend()
	    plt.tight_layout()

	    return fig


	def two_sample_ttest(
	    group1: np.ndarray,
	    group2: np.ndarray,
	    *,
	    numeric_col: str,
	    name_group1: str,
	    name_group2: str,
	    alternative: str,
	    correction: bool,
	    plot_type: str,
	    bootstrap_samples: int,
	    include_graph: bool,
	) -> tuple[pd.DataFrame, plt.Figure | None]:
	    """
	    Two-sample Student's t-test (pingouin.ttest) plus optional plots.

	    Parameters
	    ----------
	    group1, group2 : np.ndarray
	        Observed values of the two groups (tested as unpaired samples).
	    numeric_col : str
	        Column name used as the x-axis label of the histogram plot.
	    name_group1, name_group2 : str
	        Display names of the groups.
	    alternative : str
	        Alternative hypothesis, forwarded to ``pg.ttest``.
	    correction : bool
	        Forwarded to ``pg.ttest`` as ``correction``.
	    plot_type : str
	        "Sample Histogram" (mirror histogram) or "Mean Density"
	        (bootstrap distributions of the means).
	    bootstrap_samples : int
	        Number of bootstrap resamples for the "Mean Density" plot.
	    include_graph : bool
	        When true, build the figure selected by ``plot_type``.

	    Returns
	    -------
	    ``(df_output, fig)``. ``fig`` is ``None`` when ``include_graph`` is
	    false or ``plot_type`` is not one of the two recognised values.

	    Raises
	    ------
	    ValueError
	        If either group is empty.
	    """
	    if group1.size == 0 or group2.size == 0:
	        raise ValueError("One or both groups are empty after filtering.")

	    df_output = pg.ttest(
	        x=group1,
	        y=group2,
	        alternative=alternative,
	        paired=False,
	        correction=correction,
	    )

	    fig = None
	    if include_graph:
	        if plot_type == "Sample Histogram":
	            fig = mirror_plot(
	                numeric_col=numeric_col,
	                group1=group1,
	                name_group1=name_group1,
	                group2=group2,
	                name_group2=name_group2,
	                df_output=df_output,
	            )
	        elif plot_type == "Mean Density":
	            fig = plot_mean_distribution(
	                group1=group1,
	                name_group1=name_group1,
	                group2=group2,
	                name_group2=name_group2,
	                bootstrap_samples=bootstrap_samples,
	                df_output=df_output,
	            )
	        # Any other plot_type deliberately leaves fig as None.

	    return df_output, fig


	# ============================================================
	# Variance tests (Bartlett / Levene)
	# ============================================================


	def plot_variance_distribution(
	    p: float,
	    group1: np.ndarray,
	    name_group1: str,
	    var1: float,
	    group2: np.ndarray,
	    name_group2: str,
	    var2: float,
	    method: str,
	    bootstrap_samples: int,
	):
	    """
	    Overlay the bootstrap distributions of the two sample variances.

	    Each group is resampled with replacement ``bootstrap_samples`` times
	    and the unbiased (``ddof=1``) variance of every resample is collected.
	    The two collections are drawn as filled KDE curves, with dashed lines
	    at ``var1`` / ``var2`` and a text box listing ``method``, ``p`` and
	    both variances.

	    Returns the matplotlib figure.
	    """

	    def _resampled_variances(values):
	        # One bootstrap replicate per iteration: resample, then variance.
	        replicates = []
	        for _ in range(bootstrap_samples):
	            draw = np.random.choice(values, size=len(values), replace=True)
	            replicates.append(np.var(draw, ddof=1))
	        return replicates

	    # Group 1 is resampled completely before group 2.
	    boot1 = _resampled_variances(group1)
	    boot2 = _resampled_variances(group2)

	    fig, ax = plt.subplots(figsize=(8, 5))

	    series = (
	        (boot1, name_group1, var1, "rebeccapurple"),
	        (boot2, name_group2, var2, "tab:orange"),
	    )
	    for replicates, group_name, _, colour in series:
	        sns.kdeplot(
	            replicates,
	            label=f"{group_name} variance",
	            fill=True,
	            color=colour,
	            alpha=0.6,
	            ax=ax,
	        )

	    # Supplied variances as dashed reference lines, drawn after both KDEs.
	    for _, _, variance, colour in series:
	        ax.axvline(variance, color=colour, linestyle="--", linewidth=2)

	    ax.set_title(f"Bootstrap Variance Distributions\n{method}", fontsize=14)
	    ax.set_xlabel("Variance", fontsize=12)
	    ax.set_ylabel("Density", fontsize=12)
	    ax.grid(True, linestyle="--", alpha=0.5)

	    summary = "\n".join(
	        [
	            f"{method}",
	            f"p = {round(p, 3)}",
	            f"Var({name_group1}) = {round(var1, 2)}",
	            f"Var({name_group2}) = {round(var2, 2)}",
	        ]
	    )
	    ax.text(
	        0.98,
	        0.95,
	        summary,
	        transform=ax.transAxes,
	        ha="right",
	        va="top",
	        bbox=dict(boxstyle="round", facecolor="white", alpha=0.7),
	        fontsize=11,
	    )

	    ax.legend()
	    plt.tight_layout()

	    return fig


	def variance_test(
	    group1: np.ndarray,
	    group2: np.ndarray,
	    *,
	    name_group1: str,
	    name_group2: str,
	    test_type: str,
	    include_graph: bool,
	    bootstrap_samples: int,
	) -> tuple[pd.DataFrame, plt.Figure | None]:
	    """
	    Bartlett or Levene test + optional bootstrap variance plots.

	    Parameters
	    ----------
	    group1, group2 : np.ndarray
	        Observed values of the two groups.
	    name_group1, name_group2 : str
	        Display names; also used in the variance column headers.
	    test_type : str
	        "Bartlett" (``scipy.stats.bartlett``) or "Levene"
	        (``scipy.stats.levene`` with ``center="mean"``).
	    include_graph : bool
	        When true, also build the bootstrap variance figure.
	    bootstrap_samples : int
	        Number of bootstrap resamples used by the plot.

	    Returns
	    -------
	    ``(df_output, fig)``. ``df_output`` is a one-row frame with columns
	    "Test", "Statistic", "p-value" and the two ``ddof=1`` sample
	    variances; ``fig`` is ``None`` when ``include_graph`` is false.

	    Raises
	    ------
	    ValueError
	        If either group is empty or ``test_type`` is not recognised.
	    """
	    if group1.size == 0 or group2.size == 0:
	        raise ValueError("One or both groups are empty after filtering.")

	    if test_type == "Bartlett":
	        stat, p = bartlett(group1, group2)
	        method = "Bartlett's test"
	    elif test_type == "Levene":
	        stat, p = levene(group1, group2, center="mean")
	        method = "Levene's test"
	    else:
	        raise ValueError("Invalid test type selected.")

	    var1 = float(np.var(group1, ddof=1))
	    var2 = float(np.var(group2, ddof=1))

	    df_output = pd.DataFrame(
	        {
	            "Test": [method],
	            "Statistic": [stat],
	            "p-value": [p],
	            f"Var({name_group1})": [var1],
	            f"Var({name_group2})": [var2],
	        }
	    )

	    fig = None
	    if include_graph:
	        fig = plot_variance_distribution(
	            p=p,
	            group1=group1,
	            name_group1=name_group1,
	            var1=var1,
	            group2=group2,
	            name_group2=name_group2,
	            var2=var2,
	            method=method,
	            bootstrap_samples=bootstrap_samples,
	        )

	    return df_output, fig


	# ============================================================
	# One-way ANOVA
	# ============================================================


	def one_way_anova_plot(
	    data_group: pd.DataFrame,
	    numeric_col: str,
	    cat_col: str,
	    df_output: pd.DataFrame,
	):
	    """
	    KDE plot of group distributions with a p-value annotation.

	    Parameters
	    ----------
	    data_group : pd.DataFrame
	        Data containing ``numeric_col`` and ``cat_col``.
	    numeric_col : str
	        Column whose distribution is plotted per group.
	    cat_col : str
	        Grouping column; rows with a missing value here form no group.
	    df_output : pd.DataFrame
	        ANOVA table; the first row's "p-unc" entry is annotated.

	    Returns
	    -------
	    The matplotlib figure with one filled KDE and one dashed mean line
	    per group, plus a dotted line at the overall mean.
	    """
	    p_val = df_output["p-unc"].values[0]

	    groups = sorted(data_group[cat_col].dropna().unique())
	    palette = sns.color_palette("tab10", n_colors=len(groups))
	    group_color_map = dict(zip(groups, palette))

	    fig, ax = plt.subplots(figsize=(8, 5))

	    # Means are taken from the very subsets that are plotted, so every
	    # mean line has a colour. A separate groupby could yield keys that
	    # are absent from the colour map (e.g. unobserved categories of a
	    # categorical column) and fail with KeyError.
	    group_means = {}
	    for group in groups:
	        subset = data_group[data_group[cat_col] == group][numeric_col].dropna()
	        group_means[group] = subset.mean()
	        sns.kdeplot(
	            subset,
	            fill=True,
	            common_norm=False,
	            color=group_color_map[group],
	            alpha=0.5,
	            linewidth=1,
	            label=str(group),
	            ax=ax,
	        )

	    overall_mean = data_group[numeric_col].mean()
	    ax.axvline(
	        overall_mean,
	        color="black",
	        linestyle=":",
	        linewidth=1.2,
	        label="Overall mean",
	    )

	    for group, mean_val in group_means.items():
	        ax.axvline(
	            mean_val,
	            color=group_color_map[group],
	            linestyle="--",
	            linewidth=1.5,
	            label=f"{group} mean",
	        )

	    ax.text(
	        0.98,
	        0.95,
	        f"p = {p_val:.3f}",
	        transform=ax.transAxes,
	        ha="right",
	        va="top",
	        bbox=dict(boxstyle="round", facecolor="white", alpha=0.7),
	        fontsize=11,
	    )

	    ax.set_title("Group Distributions for One-way ANOVA", fontsize=14)
	    ax.set_xlabel(numeric_col, fontsize=12)
	    ax.set_ylabel("Density", fontsize=12)
	    ax.grid(True, linestyle="--", alpha=0.3)
	    ax.legend(title=cat_col)
	    plt.tight_layout()

	    return fig


	def one_way_anova(
	    data_group: pd.DataFrame,
	    *,
	    numeric_col: str,
	    cat_col: str,
	) -> tuple[pd.DataFrame, plt.Figure]:
	    """
	    Run a one-way ANOVA with ``pg.anova`` and draw the group distributions.

	    ``numeric_col`` is the dependent variable and ``cat_col`` the
	    between-subject factor; the detailed ANOVA table is requested.

	    Returns ``(anova_table, figure)``. Raises ``ValueError`` when
	    ``data_group`` is empty.
	    """
	    if data_group.empty:
	        raise ValueError("Dataset is empty after filtering.")

	    anova_table = pg.anova(
	        data=data_group,
	        dv=numeric_col,
	        between=cat_col,
	        detailed=True,
	    )

	    # The plot reads the p-value from the table, so it is built second.
	    figure = one_way_anova_plot(
	        data_group=data_group,
	        numeric_col=numeric_col,
	        cat_col=cat_col,
	        df_output=anova_table,
	    )
	    return anova_table, figure