Spaces:

Vittal-M
/

Disruption-System

Sleeping

App Files Files Community

Disruption-System / tests /test_evaluator_stats.py

Vittal-M

Upload 66 files

906e104 verified about 1 month ago

raw

history blame contribute delete

2.76 kB

	"""Statistical-analysis tests for the evaluator.

	1. Wilcoxon direction is correct for both lower-is-better and higher-is-
	better metrics, and Cohen's d is positive when DAHS wins.
	2. Nemenyi post-hoc returns mean ranks, a critical difference, and a
	pairwise matrix consistent with those ranks.
	"""
	from __future__ import annotations

	import numpy as np
	import pandas as pd

	from src.evaluator import (
	METRIC_DIRECTIONS,
	_wilcoxon_for_metric,
	_nemenyi_pairwise,
	)


	def _synthetic_df(n_seeds: int = 30) -> pd.DataFrame:
	"""DAHS dominates on tardiness, loses on throughput — synthetic fixture."""
	rng = np.random.default_rng(0)
	seeds = list(range(n_seeds))
	rows = []
	for s in seeds:
	rows.append({"seed": s, "method": "dahs_xgb",
	"total_tardiness": 50 + rng.normal(0, 5),
	"throughput": 75 + rng.normal(0, 2)})
	rows.append({"seed": s, "method": "fifo",
	"total_tardiness": 200 + rng.normal(0, 10),
	"throughput": 90 + rng.normal(0, 2)})
	rows.append({"seed": s, "method": "atc",
	"total_tardiness": 120 + rng.normal(0, 10),
	"throughput": 85 + rng.normal(0, 2)})
	return pd.DataFrame(rows)


	def test_wilcoxon_lower_metric_dahs_wins():
	df = _synthetic_df()
	pivot = df.pivot_table(index="seed", columns="method", values="total_tardiness")
	avail = list(pivot.columns)
	rows = _wilcoxon_for_metric(pivot, avail, "dahs_xgb",
	"total_tardiness", METRIC_DIRECTIONS["total_tardiness"])
	assert rows
	for r in rows:
	assert r["p_value"] < 1e-3, r
	assert r["cohens_d"] > 0, r
	assert r["significant_holm"] is True, r


	def test_wilcoxon_higher_metric_dahs_loses():
	df = _synthetic_df()
	pivot = df.pivot_table(index="seed", columns="method", values="throughput")
	avail = list(pivot.columns)
	rows = _wilcoxon_for_metric(pivot, avail, "dahs_xgb",
	"throughput", METRIC_DIRECTIONS["throughput"])
	for r in rows:
	assert r["cohens_d"] < 0, r
	assert r["p_value"] > 0.05, r


	def test_nemenyi_returns_consistent_ranks():
	df = _synthetic_df(n_seeds=40)
	pivot = df.pivot_table(index="seed", columns="method", values="total_tardiness").dropna()
	avail = list(pivot.columns)
	out = _nemenyi_pairwise(pivot, avail)
	assert out["available"]
	ranks = out["mean_ranks"]
	assert ranks["dahs_xgb"] < ranks["atc"] < ranks["fifo"]
	assert out["critical_difference"] > 0.0
	for cell in out["pairwise"]:
	if cell["rank_diff"] > out["critical_difference"]:
	assert cell["significant"], cell