| from __future__ import annotations
|
|
|
| from typing import Iterable, Tuple
|
|
|
| import numpy as np
|
| import pandas as pd
|
|
|
| from core.hypothesis_tests import (
|
| one_sample_ttest,
|
| two_sample_ttest,
|
| variance_test,
|
| one_way_anova,
|
| )
|
|
|
| ROUND = 4  # default number of decimals _round_table applies to numeric result columns
|
|
|
|
|
def _round_table(table: pd.DataFrame, decimals: int = ROUND) -> pd.DataFrame:
    """Return a copy of *table* whose numeric columns are rounded.

    Args:
        table: Result table to round; ``None`` is passed straight through.
        decimals: Number of decimal places kept in numeric columns.

    Returns:
        A rounded copy of ``table`` (the input is not modified), or ``None``
        when ``table`` is ``None``. Non-numeric columns are left untouched.
    """
    if table is None:
        return None

    rounded = table.copy()
    numeric_columns = rounded.select_dtypes(include="number").columns
    if numeric_columns.empty:
        # Nothing to round: hand back the plain copy.
        return rounded

    rounded[numeric_columns] = rounded[numeric_columns].round(decimals)
    return rounded
|
|
|
def _ensure_numeric_series(df: pd.DataFrame, column: str) -> np.ndarray:
    """Return the non-missing values of a numeric column as a NumPy array.

    Args:
        df: Dataset to read from.
        column: Name of the column to extract.

    Returns:
        The values of ``df[column]`` with missing entries dropped.

    Raises:
        ValueError: If ``df`` is ``None``, the column does not exist, the
            column holds no non-missing values, or its dtype is not numeric.
    """
    if df is None:
        raise ValueError("No dataset loaded.")
    if column not in df.columns:
        raise ValueError(f"Column '{column}' not found in the dataset.")

    series = df[column].dropna()
    if series.empty:
        raise ValueError("No valid data in the selected column.")

    # Checked last so the pre-existing error messages keep their precedence.
    # Without this guard a text column passes validation and only fails later
    # inside the statistical routine with an unrelated error.
    if not pd.api.types.is_numeric_dtype(series):
        raise ValueError(f"Column '{column}' is not numeric.")

    return series.to_numpy()
|
|
|
|
|
def _materialize_group(
    df: pd.DataFrame,
    numeric_col: str,
    cat_col: str | None,
    cat_vals: Iterable[str],
) -> np.ndarray:
    """Extract the numeric values of the rows belonging to selected categories.

    Args:
        df: Dataset to read from.
        numeric_col: Column whose values are returned.
        cat_col: Column used to select rows; ``None`` means nothing was chosen.
        cat_vals: Category values to keep. A single string is treated as one
            category rather than being split into characters.

    Returns:
        Non-missing values of ``numeric_col`` for rows whose ``cat_col`` value
        is among ``cat_vals``.

    Raises:
        ValueError: If no categorical column or categories are given, a column
            is missing from ``df``, the categories cannot be cast to the
            column's dtype, or the resulting group is empty.
    """
    if cat_col is None:
        raise ValueError("No categorical column selected.")
    if cat_col not in df.columns:
        raise ValueError(f"Categorical column '{cat_col}' not found in the dataset.")

    if cat_vals is None:
        values = []
    elif isinstance(cat_vals, str):
        # list("abc") would yield ['a', 'b', 'c'] and silently match nothing.
        values = [cat_vals]
    else:
        values = list(cat_vals)
    if not values:
        raise ValueError(f"No categories selected for column '{cat_col}'.")

    # Raise the module's usual ValueError instead of leaking a KeyError from
    # the .loc lookup below.
    if numeric_col not in df.columns:
        raise ValueError(f"Column '{numeric_col}' not found in the dataset.")

    # Selections are cast to the column's dtype so that, for example, the
    # string "1" matches an integer-coded category.
    try:
        selected = pd.Series(values).astype(df[cat_col].dtype)
    except (TypeError, ValueError) as exc:
        raise ValueError(
            f"Selected categories cannot be matched against column '{cat_col}'."
        ) from exc

    mask = df[cat_col].isin(selected)
    series = df.loc[mask, numeric_col].dropna()
    if series.empty:
        raise ValueError("One or more groups are empty after filtering.")
    return series.to_numpy()
|
|
|
|
|
def run_hypothesis_testing(
    *,
    df: pd.DataFrame | None,
    numeric_col: str,
    hypo_test: str,
    mu0_text: str,
    alternative: str,
    include_graph: bool,
    bootstrap_samples: int,
    cat_col1: str | None,
    cat_vals1: list[str],
    name_group1: str,
    cat_col2: str | None,
    cat_vals2: list[str],
    name_group2: str,
    cat_col3: str | None,
    cat_vals3: list[str],
    plot_type: str,
    correction: bool,
    test_type: str,
) -> Tuple[pd.DataFrame, object | None]:
    """High-level dispatcher used by the Hypothesis Testing tab.

    Validates the inputs, prepares the sample(s) for the test named by
    ``hypo_test``, delegates to the matching routine and rounds the numeric
    columns of the returned table.

    Args:
        df: Loaded dataset, or ``None`` when nothing is loaded.
        numeric_col: Column holding the values under test.
        hypo_test: One of ``"One sample Student's t-test"``,
            ``"Two samples Student's t-test"``,
            ``"Equal variance between two groups"`` or ``"One-way ANOVA"``.
        mu0_text: Text of the μ₀ value for the one-sample t-test; parsed
            with ``float``.
        alternative: Forwarded to the one- and two-sample t-tests.
        include_graph: Forwarded to every test except ANOVA.
        bootstrap_samples: Forwarded to every test except ANOVA.
        cat_col1: Categorical column defining group 1 (two-group tests).
        cat_vals1: Categories of ``cat_col1`` that make up group 1.
        name_group1: Label for group 1; ``"Group 1"`` when empty.
        cat_col2: Categorical column defining group 2 (two-group tests).
        cat_vals2: Categories of ``cat_col2`` that make up group 2.
        name_group2: Label for group 2; ``"Group 2"`` when empty.
        cat_col3: Categorical column used for ANOVA.
        cat_vals3: Categories of ``cat_col3`` included in the ANOVA.
        plot_type: Forwarded to the two-sample t-test only.
        correction: Forwarded to the two-sample t-test only.
        test_type: Forwarded to the variance test only.

    Returns:
        (result_table, figure_or_none)

    Raises:
        ValueError: On missing or invalid inputs, or an unknown ``hypo_test``.
    """
    if df is None:
        raise ValueError("No dataset loaded.")

    # Validate the numeric column once, whatever the test; the cleaned sample
    # is reused by the one-sample branch instead of being rebuilt.
    sample = _ensure_numeric_series(df, numeric_col)

    if hypo_test == "One sample Student's t-test":
        # str() keeps a missing or non-string value from crashing on .strip().
        if mu0_text is None or not str(mu0_text).strip():
            raise ValueError("μ₀ must be specified for the one-sample t-test.")
        try:
            mu0 = float(mu0_text)
        except (TypeError, ValueError) as exc:
            raise ValueError("μ₀ must be a numeric value.") from exc

        table, fig = one_sample_ttest(
            sample=sample,
            mu0=mu0,
            alternative=alternative,
            numeric_col=numeric_col,
            bootstrap_samples=bootstrap_samples,
            include_graph=include_graph,
        )
        return _round_table(table), fig

    two_sample = "Two samples Student's t-test"
    equal_variance = "Equal variance between two groups"
    if hypo_test in (two_sample, equal_variance):
        # Both two-group tests share the same group preparation.
        group1 = _materialize_group(df, numeric_col, cat_col1, cat_vals1)
        group2 = _materialize_group(df, numeric_col, cat_col2, cat_vals2)
        name1 = name_group1 or "Group 1"
        name2 = name_group2 or "Group 2"

        if hypo_test == two_sample:
            table, fig = two_sample_ttest(
                group1=group1,
                group2=group2,
                numeric_col=numeric_col,
                name_group1=name1,
                name_group2=name2,
                alternative=alternative,
                correction=correction,
                plot_type=plot_type,
                bootstrap_samples=bootstrap_samples,
                include_graph=include_graph,
            )
        else:
            table, fig = variance_test(
                group1=group1,
                group2=group2,
                name_group1=name1,
                name_group2=name2,
                test_type=test_type,
                include_graph=include_graph,
                bootstrap_samples=bootstrap_samples,
            )
        return _round_table(table), fig

    if hypo_test == "One-way ANOVA":
        if cat_col3 is None:
            raise ValueError("A categorical column must be selected for ANOVA.")
        if cat_col3 not in df.columns:
            raise ValueError(
                f"Categorical column '{cat_col3}' not found in the dataset."
            )
        if not cat_vals3:
            raise ValueError("At least one category must be selected for ANOVA.")

        # Cast the selections to the column's dtype so they compare equal to
        # the stored values.
        try:
            cat_series = pd.Series(cat_vals3).astype(df[cat_col3].dtype)
        except (TypeError, ValueError) as exc:
            raise ValueError(
                f"Selected categories cannot be matched against column '{cat_col3}'."
            ) from exc

        selected_rows = df[df[cat_col3].isin(cat_series)]
        data_group = selected_rows[[numeric_col, cat_col3]].dropna()
        if data_group.empty:
            # Fail here with a clear message rather than inside the ANOVA
            # routine on an empty frame.
            raise ValueError("No valid data for the selected categories.")

        table, fig = one_way_anova(
            data_group=data_group,
            numeric_col=numeric_col,
            cat_col=cat_col3,
        )
        return _round_table(table), fig

    raise ValueError(f"Unknown hypothesis test: {hypo_test}")
|
|
|