# misscp/tests/test_coverage_asymmetry_analysis.py
# Anonymous
# Initial anonymous MissCP release
# 32f5a65
from __future__ import annotations
import pytest
import pandas as pd
def test_compute_worst_asymmetry_from_subgroups_splits_under_and_over() -> None:
    """Check the asymmetry metrics reported for one (dataset, method, seed) group.

    Four subgroups share a 0.9 target coverage; two fall below it and two sit
    above it. The single aggregated row is expected to report the largest
    shortfall and the largest excess separately, plus the mean absolute gap
    and the population variance of the subgroup coverages.
    """
    from sepsis_mcp.coverage_asymmetry_analysis import (
        compute_worst_asymmetry_from_subgroups,
    )

    coverages = [0.82, 0.93, 0.88, 0.91]
    n_subgroups = len(coverages)
    subgroups = pd.DataFrame(
        {
            "dataset": ["demo"] * n_subgroups,
            "method": ["standard"] * n_subgroups,
            "seed": [0] * n_subgroups,
            "coverage": coverages,
            "target_coverage": [0.9] * n_subgroups,
            "average_set_size": [0.9, 1.0, 1.1, 1.2],
            "group_label": ["a", "b", "c", "d"],
        }
    )

    asymmetry = compute_worst_asymmetry_from_subgroups(
        subgroups,
        group_columns=["dataset", "method", "seed"],
    )

    # All four subgroup rows share the same grouping key, so exactly one
    # aggregated row is expected.
    assert len(asymmetry) == 1
    aggregated = asymmetry.iloc[0]

    # Gaps against the 0.9 target are -0.08, +0.03, -0.02 and +0.01.
    expected_metrics = {
        "worst_under_coverage": 0.08,
        "worst_over_coverage": 0.03,
        "average_absolute_gap": (0.08 + 0.03 + 0.02 + 0.01) / 4,
        "coverage_variance": pd.Series([0.82, 0.93, 0.88, 0.91]).var(ddof=0),
    }
    for metric_name, expected_value in expected_metrics.items():
        assert aggregated[metric_name] == pytest.approx(expected_value)
def test_build_full_subgroup_summary_aggregates_coverage_count_and_set_size() -> None:
    """Check that the subgroup summary exposes ``*_mean`` aggregates per group.

    Each (dataset, model_type, method, group_label) combination occurs exactly
    once in the input, so the mean columns for the ``standard``/``low`` row
    are expected to equal that row's raw values.
    """
    from sepsis_mcp.coverage_asymmetry_analysis import build_full_subgroup_summary

    shared = {"dataset": "demo", "model_type": "xgboost"}
    subgroup_rows = [
        {
            **shared,
            "method": "standard",
            "group_label": "low",
            "count": 10,
            "coverage": 0.84,
            "coverage_gap": -0.06,
            "average_set_size": 0.92,
        },
        {
            **shared,
            "method": "standard",
            "group_label": "high",
            "count": 20,
            "coverage": 0.93,
            "coverage_gap": 0.03,
            "average_set_size": 1.01,
        },
        {
            **shared,
            "method": "missingness_aware",
            "group_label": "low",
            "count": 12,
            "coverage": 0.89,
            "coverage_gap": -0.01,
            "average_set_size": 0.95,
        },
        {
            **shared,
            "method": "missingness_aware",
            "group_label": "high",
            "count": 18,
            "coverage": 0.91,
            "coverage_gap": 0.01,
            "average_set_size": 1.03,
        },
    ]
    subgroups = pd.DataFrame(subgroup_rows)

    summary = build_full_subgroup_summary(
        subgroups,
        group_columns=["dataset", "model_type", "method", "group_label"],
    )

    required_columns = {"count_mean", "coverage_mean", "average_set_size_mean"}
    assert required_columns.issubset(summary.columns)

    # Pick out the single standard/low row and compare it with its raw inputs.
    is_standard = summary["method"].eq("standard")
    is_low = summary["group_label"].eq("low")
    standard_low = summary.loc[is_standard & is_low].iloc[0]
    assert standard_low["count_mean"] == pytest.approx(10)
    assert standard_low["coverage_mean"] == pytest.approx(0.84)
    assert standard_low["average_set_size_mean"] == pytest.approx(0.92)
def test_run_coverage_asymmetry_analysis_writes_full_decomposition_summary(tmp_path) -> None:
    """Run the analysis end to end on small CSV fixtures and check its outputs.

    The fixtures are one ``subgroup_summary.csv`` per model directory under a
    GOSSIS-style base directory and one ``subgroup_runs.csv`` under a
    MIMIC-style base directory. The paths returned by the run under the
    ``"summary"`` and ``"subgroup_summary"`` keys must be readable CSV files
    containing the expected aggregate columns.
    """
    from sepsis_mcp.coverage_asymmetry_analysis import (
        CoverageAsymmetryConfig,
        run_coverage_asymmetry_analysis,
    )

    gossis_base = tmp_path / "gossis"
    mimic_base = tmp_path / "mimic"
    output_dir = tmp_path / "out"

    # Each tuple: (group_label, count, coverage, coverage_gap, average_set_size).
    gossis_subgroups = (
        ("low", 10, 0.82, -0.08, 0.95),
        ("high", 20, 0.93, 0.03, 1.02),
    )
    for model_name in ("logistic_regression", "xgboost", "mlp"):
        model_root = gossis_base / model_name
        model_root.mkdir(parents=True)
        gossis_rows = [
            {
                "model_type": model_name,
                "method": "standard",
                "random_state": 0,
                "group_label": label,
                "count": count,
                "coverage": coverage,
                "coverage_gap": gap,
                "average_set_size": set_size,
                "missingness_grouping_strategy": "coverage_gap_variable",
            }
            for label, count, coverage, gap, set_size in gossis_subgroups
        ]
        pd.DataFrame(gossis_rows).to_csv(model_root / "subgroup_summary.csv", index=False)

    # The MIMIC fixture is one flat runs file keyed by split_id rather than
    # random_state, and it has no grouping-strategy column.
    mimic_base.mkdir(parents=True)
    mimic_subgroups = (
        ("low", 12, 0.84, -0.06, 0.91),
        ("high", 18, 0.91, 0.01, 1.01),
    )
    mimic_rows = [
        {
            "model_type": "xgboost",
            "method": "standard",
            "split_id": "split0",
            "group_label": label,
            "count": count,
            "coverage": coverage,
            "coverage_gap": gap,
            "average_set_size": set_size,
        }
        for label, count, coverage, gap, set_size in mimic_subgroups
    ]
    pd.DataFrame(mimic_rows).to_csv(mimic_base / "subgroup_runs.csv", index=False)

    config = CoverageAsymmetryConfig(
        gossis_base_dir=gossis_base,
        mimic_base_dirs=(mimic_base,),
        output_dir=output_dir,
    )
    paths = run_coverage_asymmetry_analysis(config)

    summary = pd.read_csv(paths["summary"])
    subgroup_summary = pd.read_csv(paths["subgroup_summary"])

    summary_columns = {"average_absolute_gap_mean", "coverage_variance_mean"}
    subgroup_columns = {"count_mean", "coverage_mean", "average_set_size_mean"}
    assert summary_columns.issubset(summary.columns)
    assert subgroup_columns.issubset(subgroup_summary.columns)