# Listing metadata carried over from the file viewer: size 2,755 bytes, id 906e104.
"""Statistical-analysis tests for the evaluator.

  1. Wilcoxon direction is correct for both lower-is-better and higher-is-
     better metrics, and Cohen's d is positive when DAHS wins.
  2. Nemenyi post-hoc returns mean ranks, a critical difference, and a
     pairwise matrix consistent with those ranks.
"""
from __future__ import annotations

import numpy as np
import pandas as pd

from src.evaluator import (
    METRIC_DIRECTIONS,
    _wilcoxon_for_metric,
    _nemenyi_pairwise,
)


def _synthetic_df(n_seeds: int = 30) -> pd.DataFrame:
    """Build a reproducible long-format fixture with three methods per seed.

    For every seed in ``range(n_seeds)`` one row is emitted for ``dahs_xgb``,
    ``fifo`` and ``atc`` (in that order).  ``dahs_xgb`` is centred lowest on
    ``total_tardiness`` and also lowest on ``throughput``.  Noise is drawn
    from ``np.random.default_rng(0)``, so repeated calls yield equal frames.

    Args:
        n_seeds: Number of seeds (rows per method) to generate.

    Returns:
        A DataFrame with columns ``seed``, ``method``, ``total_tardiness``
        and ``throughput``.
    """
    # (method, tardiness centre, tardiness noise sd, throughput centre)
    profiles = (
        ("dahs_xgb", 50, 5, 75),
        ("fifo", 200, 10, 90),
        ("atc", 120, 10, 85),
    )
    noise = np.random.default_rng(0)
    records = []
    for seed in range(n_seeds):
        for method, tard_centre, tard_sd, thr_centre in profiles:
            # Draw order (tardiness first, then throughput) fixes the
            # random stream, so it must not be swapped.
            tardiness = tard_centre + noise.normal(0, tard_sd)
            throughput = thr_centre + noise.normal(0, 2)
            records.append({
                "seed": seed,
                "method": method,
                "total_tardiness": tardiness,
                "throughput": throughput,
            })
    return pd.DataFrame(records)


def test_wilcoxon_lower_metric_dahs_wins():
    """Lower-is-better metric: ``dahs_xgb`` must win every comparison.

    The fixture centres ``dahs_xgb`` well below the other methods on
    ``total_tardiness``, so each returned row is expected to carry a tiny
    p-value, a positive Cohen's d and a Holm-significant flag.
    """
    metric = "total_tardiness"
    per_seed = _synthetic_df().pivot_table(
        index="seed", columns="method", values=metric
    )
    methods = list(per_seed.columns)
    results = _wilcoxon_for_metric(
        per_seed, methods, "dahs_xgb", metric, METRIC_DIRECTIONS[metric]
    )
    # An empty result would make the loop below pass without checking anything.
    assert results
    for row in results:
        assert row["p_value"] < 1e-3, row
        assert row["cohens_d"] > 0, row
        assert row["significant_holm"] is True, row


def test_wilcoxon_higher_metric_dahs_loses():
    """Higher-is-better metric: ``dahs_xgb`` must lose every comparison.

    The fixture centres ``dahs_xgb`` below the other methods on
    ``throughput``, so each returned row is expected to show a negative
    Cohen's d and a non-significant p-value.
    """
    df = _synthetic_df()
    pivot = df.pivot_table(index="seed", columns="method", values="throughput")
    avail = list(pivot.columns)
    rows = _wilcoxon_for_metric(pivot, avail, "dahs_xgb",
                                "throughput", METRIC_DIRECTIONS["throughput"])
    # Without this guard an empty result would pass vacuously, because every
    # remaining assertion lives inside the loop (the lower-metric test
    # already checks the same thing).
    assert rows
    for r in rows:
        assert r["cohens_d"] < 0, r
        # NOTE(review): p > 0.05 despite a large effect presumably relies on
        # the test being directional -- confirm against _wilcoxon_for_metric.
        assert r["p_value"] > 0.05, r


def test_nemenyi_returns_consistent_ranks():
    """Nemenyi output must agree with the fixture's tardiness ordering.

    Checks that the result is flagged available, that mean ranks order
    ``dahs_xgb`` < ``atc`` < ``fifo``, that the critical difference is
    positive, and that every pair whose ``rank_diff`` exceeds the critical
    difference is marked significant.
    """
    frame = _synthetic_df(n_seeds=40)
    # Drop any seed with a missing method before ranking.
    complete = frame.pivot_table(
        index="seed", columns="method", values="total_tardiness"
    ).dropna()
    result = _nemenyi_pairwise(complete, list(complete.columns))

    assert result["available"]
    mean_rank = result["mean_ranks"]
    assert mean_rank["dahs_xgb"] < mean_rank["atc"] < mean_rank["fifo"]
    assert result["critical_difference"] > 0.0
    for pair in result["pairwise"]:
        # Only pairs beyond the critical difference are required to be flagged.
        if not pair["rank_diff"] > result["critical_difference"]:
            continue
        assert pair["significant"], pair