Spaces:
Running
Running
File size: 2,555 Bytes
"""
Statistical Tests
=================

Statistical significance testing for model comparisons.

Implements:
- Friedman test (non-parametric ANOVA)
- Nemenyi post-hoc test
- Critical difference calculation

Author: UW MSIM Team
Date: November 2025
"""
import numpy as np
import pandas as pd
from scipy import stats
from typing import Dict, Tuple
import logging
logger = logging.getLogger(__name__)
def friedman_test(results_df: pd.DataFrame, alpha: float = 0.05) -> Dict:
    """
    Friedman test for comparing multiple models.

    Parameters
    ----------
    results_df : pd.DataFrame
        Rows = datasets, columns = models, values = metric scores
        (higher is better).
    alpha : float, default 0.05
        Significance level used to set the ``'significant'`` flag.

    Returns
    -------
    results : dict
        ``'statistic'`` : float, Friedman chi-square statistic.
        ``'p_value'`` : float, p-value of the test.
        ``'significant'`` : bool, True when ``p_value < alpha``.
        ``'avg_ranks'`` : dict mapping each model (column) to its mean rank
        across datasets (rank 1 = highest score).

    Raises
    ------
    ValueError
        Propagated from ``scipy.stats.friedmanchisquare``, which requires at
        least three sets of samples (i.e. three model columns).
    """
    # Rank models within each dataset; rank 1 goes to the highest score.
    ranks = results_df.rank(axis=1, ascending=False)

    # One sample per model column, each holding that model's per-dataset ranks.
    stat, p_value = stats.friedmanchisquare(*[ranks[col] for col in ranks.columns])

    # Convert to builtin types so the result is plain-Python (e.g. the
    # comparison below would otherwise yield numpy.bool_, which json.dumps
    # rejects).
    stat = float(stat)
    p_value = float(p_value)

    logger.info("Friedman Test: statistic=%.4f, p-value=%.4e", stat, p_value)

    return {
        'statistic': stat,
        'p_value': p_value,
        'significant': bool(p_value < alpha),
        'avg_ranks': ranks.mean().to_dict(),
    }
def nemenyi_post_hoc(results_df: pd.DataFrame) -> pd.DataFrame:
    """
    Nemenyi post-hoc test (pairwise comparisons between models).

    Parameters
    ----------
    results_df : pd.DataFrame
        Rows = datasets, columns = models, values = metric scores
        (higher is better).

    Returns
    -------
    p_values : pd.DataFrame
        Pairwise p-values as returned by
        ``scikit_posthocs.posthoc_nemenyi_friedman`` (one row/column per
        model).

    Raises
    ------
    ImportError
        If the optional ``scikit-posthocs`` package is not installed.
    """
    # Keep the try body to the optional import only, so an ImportError raised
    # anywhere else is not misreported as a missing package.
    try:
        import scikit_posthocs as sp
    except ImportError:
        logger.error("scikit-posthocs not installed. Install with: pip install scikit-posthocs")
        raise

    # Rank models within each dataset; rank 1 goes to the highest score.
    ranks = results_df.rank(axis=1, ascending=False)

    # posthoc_nemenyi_friedman expects a block-design matrix: rows are blocks
    # (datasets) and columns are groups (models). That is already the layout
    # of `ranks`, so it must NOT be transposed -- transposing would compare
    # datasets against each other instead of models.
    return sp.posthoc_nemenyi_friedman(ranks)
def critical_difference(
    n_datasets: int,
    n_models: int,
    alpha: float = 0.05
) -> float:
    """
    Calculate the Nemenyi critical difference for CD diagrams.

    Two models differ significantly when their average ranks differ by at
    least ``CD = q_alpha * sqrt(k * (k + 1) / (6 * N))``, with ``k`` models
    and ``N`` datasets.

    Parameters
    ----------
    n_datasets : int
        Number of datasets (N); must be at least 1.
    n_models : int
        Number of models (k); must be at least 2.
    alpha : float
        Significance level, strictly between 0 and 1.

    Returns
    -------
    cd : float
        Critical difference value

    Raises
    ------
    ValueError
        If ``n_datasets < 1``, ``n_models < 2`` or ``alpha`` is outside (0, 1).
    """
    if n_datasets < 1:
        raise ValueError(f"n_datasets must be >= 1, got {n_datasets}")
    if n_models < 2:
        raise ValueError(f"n_models must be >= 2, got {n_models}")
    if not 0 < alpha < 1:
        raise ValueError(f"alpha must be in (0, 1), got {alpha}")

    # Nemenyi critical value: the upper-alpha quantile of the Studentized
    # range distribution for n_models groups with infinite degrees of freedom,
    # divided by sqrt(2). A plain normal quantile matches this only for
    # n_models == 2 and understates the critical difference for more models.
    q_alpha = stats.studentized_range.ppf(1 - alpha, n_models, np.inf) / np.sqrt(2)
    cd = float(q_alpha * np.sqrt((n_models * (n_models + 1)) / (6 * n_datasets)))

    logger.info("Critical Difference: %.4f (alpha=%s)", cd, alpha)
    return cd
|