Spaces:
Running
Running
| """ | |
| Statistical Tests | |
| ================= | |
| Statistical significance testing for model comparisons. | |
| Implements: | |
| - Friedman test (non-parametric ANOVA) | |
| - Nemenyi post-hoc test | |
| - Critical difference calculation | |
| Author: UW MSIM Team | |
| Date: November 2025 | |
| """ | |
| import numpy as np | |
| import pandas as pd | |
| from scipy import stats | |
| from typing import Dict, Tuple | |
| import logging | |
| logger = logging.getLogger(__name__) | |
def friedman_test(results_df: pd.DataFrame, alpha: float = 0.05) -> Dict:
    """
    Friedman test for comparing multiple models.

    Models are ranked within each dataset (rank 1 = highest score, tied
    scores share the average rank) and the per-model rank vectors are
    passed to ``scipy.stats.friedmanchisquare``.

    Parameters
    ----------
    results_df : pd.DataFrame
        Rows = datasets, columns = models, values = metric scores
        (higher is better)
    alpha : float, default 0.05
        Significance level used to set the ``significant`` flag

    Returns
    -------
    results : dict
        ``statistic`` : float
            Friedman chi-square statistic
        ``p_value`` : float
            p-value of the test
        ``significant`` : bool
            True when ``p_value < alpha``
        ``avg_ranks`` : dict
            Mean rank of each model (column label -> mean rank)

    Raises
    ------
    ValueError
        Propagated from ``scipy.stats.friedmanchisquare``, which requires
        at least three sets of samples (i.e. three model columns).
    """
    # Rank models for each dataset; ascending=False gives rank 1 to the
    # highest score.
    ranks = results_df.rank(axis=1, ascending=False)

    # One positional argument per model. Unpacking the transposed array
    # (instead of indexing by label) stays correct even if column labels
    # are duplicated.
    stat, p_value = stats.friedmanchisquare(*ranks.to_numpy().T)

    # Convert NumPy scalars to built-in types so the result dict is
    # JSON-serialisable and `is True` / `is False` checks behave as expected.
    stat = float(stat)
    p_value = float(p_value)

    logger.info(f"Friedman Test: statistic={stat:.4f}, p-value={p_value:.4e}")

    return {
        'statistic': stat,
        'p_value': p_value,
        'significant': bool(p_value < alpha),
        'avg_ranks': ranks.mean().to_dict(),
    }
def nemenyi_post_hoc(results_df: pd.DataFrame) -> pd.DataFrame:
    """
    Nemenyi post-hoc test (pairwise comparisons between models).

    Parameters
    ----------
    results_df : pd.DataFrame
        Rows = datasets, columns = models, values = metric scores
        (higher is better)

    Returns
    -------
    p_values : pd.DataFrame
        Pairwise p-values as returned by
        ``scikit_posthocs.posthoc_nemenyi_friedman``; rows and columns
        are the models (the columns of ``results_df``).

    Raises
    ------
    ImportError
        If the optional ``scikit-posthocs`` package is not installed.
    """
    # Only the optional import is guarded, so an ImportError raised for any
    # other reason further down is not mislabelled as a missing package.
    try:
        import scikit_posthocs as sp
    except ImportError:
        logger.error("scikit-posthocs not installed. Install with: pip install scikit-posthocs")
        raise

    # Rank models within each dataset (rank 1 = highest score).
    ranks = results_df.rank(axis=1, ascending=False)

    # posthoc_nemenyi_friedman expects a block-design matrix: rows are
    # blocks (datasets) and columns are groups (models). `ranks` already
    # has that layout, so it must NOT be transposed -- transposing would
    # compare datasets against each other instead of models.
    return sp.posthoc_nemenyi_friedman(ranks)
def critical_difference(
    n_datasets: int,
    n_models: int,
    alpha: float = 0.05
) -> float:
    """
    Calculate the Nemenyi critical difference for CD diagrams.

    Two models differ significantly when their average ranks differ by at
    least ``cd``, where::

        cd = q_alpha * sqrt(n_models * (n_models + 1) / (6 * n_datasets))

    and ``q_alpha`` is the upper-``alpha`` quantile of the Studentized
    range distribution for ``n_models`` groups with infinite degrees of
    freedom, divided by ``sqrt(2)``.

    Parameters
    ----------
    n_datasets : int
        Number of datasets (must be >= 1)
    n_models : int
        Number of models (must be >= 2)
    alpha : float
        Significance level, strictly between 0 and 1

    Returns
    -------
    cd : float
        Critical difference value

    Raises
    ------
    ValueError
        If ``n_datasets < 1``, ``n_models < 2`` or ``alpha`` is not in
        the open interval (0, 1).
    """
    if n_datasets < 1:
        raise ValueError(f"n_datasets must be >= 1, got {n_datasets}")
    if n_models < 2:
        raise ValueError(f"n_models must be >= 2, got {n_models}")
    if not 0.0 < alpha < 1.0:
        raise ValueError(f"alpha must be in (0, 1), got {alpha}")

    # Critical value of the Nemenyi test. A plain normal quantile
    # (norm.ppf(1 - alpha / 2)) matches this only for two models; for more
    # models it ignores the multiple-comparison correction and yields a CD
    # that is too small. The Studentized range quantile depends on
    # n_models and handles that correctly.
    q_alpha = stats.studentized_range.ppf(1 - alpha, n_models, np.inf) / np.sqrt(2)

    cd = float(q_alpha * np.sqrt((n_models * (n_models + 1)) / (6 * n_datasets)))

    logger.info(f"Critical Difference: {cd:.4f} (alpha={alpha})")

    return cd