File size: 2,555 Bytes
e057d08
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
"""
Statistical Tests
=================

Statistical significance testing for model comparisons.

Implements:
- Friedman test (non-parametric ANOVA)
- Nemenyi post-hoc test
- Critical difference calculation

Author: UW MSIM Team
Date: November 2025
"""

import numpy as np
import pandas as pd
from scipy import stats
from typing import Dict, Tuple
import logging

logger = logging.getLogger(__name__)


def friedman_test(results_df: pd.DataFrame, alpha: float = 0.05) -> Dict:
    """
    Friedman test for comparing multiple models.

    Models are ranked within each dataset (rank 1 = highest score) and the
    Friedman chi-square test is applied to check whether the models' rank
    distributions differ.

    Parameters
    ----------
    results_df : pd.DataFrame
        Rows = datasets, columns = models, values = metric scores
        (higher is better).
    alpha : float, default 0.05
        Significance level used for the ``'significant'`` flag.

    Returns
    -------
    results : dict
        ``'statistic'`` (float), ``'p_value'`` (float), ``'significant'``
        (bool, ``p_value < alpha``) and ``'avg_ranks'`` (dict mapping each
        model column to its mean rank across datasets).

    Raises
    ------
    ValueError
        Propagated from ``scipy.stats.friedmanchisquare`` when fewer than
        three models (columns) are supplied.
    """
    # Rank models within each dataset; ascending=False gives rank 1 to the
    # highest score, ties receive the average rank.
    ranks = results_df.rank(axis=1, ascending=False)

    # friedmanchisquare re-ranks its inputs internally; the statistic is
    # unaffected by rank direction, so feeding the ranks is equivalent to
    # feeding the raw scores.
    stat, p_value = stats.friedmanchisquare(*(ranks[col] for col in ranks.columns))

    logger.info(f"Friedman Test: statistic={stat:.4f}, p-value={p_value:.4e}")

    # Convert NumPy scalars to builtins so the result is JSON-serialisable
    # and `result['significant'] is True` behaves as expected.
    return {
        'statistic': float(stat),
        'p_value': float(p_value),
        'significant': bool(p_value < alpha),
        'avg_ranks': ranks.mean().to_dict()
    }


def nemenyi_post_hoc(results_df: pd.DataFrame) -> pd.DataFrame:
    """
    Nemenyi post-hoc test (pairwise comparisons between models).

    Parameters
    ----------
    results_df : pd.DataFrame
        Rows = datasets, columns = models, values = metric scores

    Returns
    -------
    p_values : pd.DataFrame
        Pairwise p-values, indexed and labelled by model
        (n_models x n_models).

    Raises
    ------
    ImportError
        If the optional dependency ``scikit-posthocs`` is not installed.
    """
    # Optional dependency: import lazily and keep the try body limited to the
    # import so unrelated errors from the test itself are not caught here.
    try:
        import scikit_posthocs as sp
    except ImportError:
        logger.error("scikit-posthocs not installed. Install with: pip install scikit-posthocs")
        raise

    ranks = results_df.rank(axis=1, ascending=False)

    # posthoc_nemenyi_friedman expects a block-design matrix with rows =
    # blocks (datasets) and columns = groups (models). The matrix must NOT be
    # transposed, otherwise datasets would be compared against each other
    # instead of models. The function re-ranks within each block and uses
    # absolute mean-rank differences, so the rank direction does not matter.
    return sp.posthoc_nemenyi_friedman(ranks)


def critical_difference(
    n_datasets: int,
    n_models: int,
    alpha: float = 0.05
) -> float:
    """
    Calculate the Nemenyi critical difference for CD diagrams.

    Two models are considered significantly different when their average
    ranks differ by at least

        CD = q_alpha * sqrt(k * (k + 1) / (6 * N))

    where ``k`` is the number of models, ``N`` the number of datasets and
    ``q_alpha`` the critical value of the studentized range statistic (with
    infinite degrees of freedom) divided by ``sqrt(2)``.

    Parameters
    ----------
    n_datasets : int
        Number of datasets (must be >= 1)
    n_models : int
        Number of models (must be >= 2)
    alpha : float
        Significance level, strictly between 0 and 1

    Returns
    -------
    cd : float
        Critical difference value

    Raises
    ------
    ValueError
        If ``n_datasets < 1``, ``n_models < 2`` or ``alpha`` is outside (0, 1).
    """
    if n_datasets < 1:
        raise ValueError(f"n_datasets must be >= 1, got {n_datasets}")
    if n_models < 2:
        raise ValueError(f"n_models must be >= 2, got {n_models}")
    if not 0.0 < alpha < 1.0:
        raise ValueError(f"alpha must be in (0, 1), got {alpha}")

    # Nemenyi critical value: upper-alpha quantile of the studentized range
    # distribution for k groups and infinite df, divided by sqrt(2).
    # A plain normal quantile (norm.ppf(1 - alpha / 2)) matches this only for
    # k == 2; for more models it ignores the multiple-comparison correction
    # and underestimates the critical difference.
    q_alpha = stats.studentized_range.ppf(1 - alpha, n_models, np.inf) / np.sqrt(2)

    cd = float(q_alpha * np.sqrt((n_models * (n_models + 1)) / (6 * n_datasets)))

    logger.info(f"Critical Difference: {cd:.4f} (alpha={alpha})")

    return cd