# misscp/tests/test_missingness_groups.py
# Anonymous
# Initial anonymous MissCP release
# 32f5a65
from __future__ import annotations
import importlib
from pathlib import Path
import pandas as pd
import pytest
def _missingness_groups():
    """Import and return the ``sepsis_mcp.missingness_groups`` module.

    The import is performed when this helper is called rather than when the
    test file itself is imported, so an import failure shows up in the
    test that calls the helper instead of at file import time.
    """
    module_name = "sepsis_mcp.missingness_groups"
    return importlib.import_module(module_name)
def _selection_frame() -> pd.DataFrame:
    """Return the shared eight-row patient fixture used by the tests.

    Columns, in order: ``patient_id`` (``p000`` .. ``p007``), a binary
    ``target_in_6h`` that alternates 1/0, a ``global_missing_rate``, and one
    ``<variable>_missing_rate`` column for each of Lactate, WBC and
    Creatinine.
    """
    half = 4  # the first and last four rows carry opposite Lactate/Creatinine rates
    columns = {
        "patient_id": [f"p{index:03d}" for index in range(2 * half)],
        "target_in_6h": [1, 0] * half,
        "global_missing_rate": [0.10, 0.12, 0.14, 0.16, 0.70, 0.72, 0.74, 0.76],
        # Lactate: rate 0.0 in the first four rows, 1.0 in the last four.
        "Lactate_missing_rate": [0.0] * half + [1.0] * half,
        # WBC: alternates row by row, so it does not line up with either half.
        "WBC_missing_rate": [0.0, 1.0] * half,
        # Creatinine: mirror image of Lactate.
        "Creatinine_missing_rate": [1.0] * half + [0.0] * half,
    }
    return pd.DataFrame(columns)
def test_coverage_gap_variable_chooses_variable_with_largest_selection_split_gap() -> None:
    """Check which variable ``coverage_gap_variable`` selects and how it labels groups.

    With Lactate, WBC and Creatinine offered as candidates on the shared
    fixture, the result metadata must echo the strategy name, report
    ``Lactate`` as the selected variable, and map group 0 / group 1 to the
    ever-observed / never-observed Lactate labels.
    """
    module = _missingness_groups()
    selection = _selection_frame()
    # Rows 0-3 act as the calibration split, rows 4-7 as the test split.
    calibration = selection.iloc[:4].copy()
    held_out = selection.iloc[4:].copy()

    outcome = module.build_missingness_groups(
        selection_frame=selection,
        calibration_frame=calibration,
        test_frame=held_out,
        strategy="coverage_gap_variable",
        candidate_missing_variables=["Lactate", "WBC", "Creatinine"],
        min_group_fraction=0.25,
    )

    assert outcome.metadata["strategy"] == "coverage_gap_variable"
    assert outcome.metadata["selected_variable"] == "Lactate"
    labels = outcome.group_labels
    assert labels[0] == "lactate_ever_observed"
    assert labels[1] == "lactate_never_observed"
def test_variable_grouping_uses_ever_observed_vs_never_observed() -> None:
    """Check the per-row labels produced when Lactate is the only candidate.

    The ``group_label`` column of ``selection_groups`` must contain exactly
    the ever-observed and never-observed Lactate labels, in that order of
    first appearance.
    """
    module = _missingness_groups()
    selection = _selection_frame()
    calibration = selection.iloc[:4].copy()
    held_out = selection.iloc[4:].copy()

    outcome = module.build_missingness_groups(
        selection_frame=selection,
        calibration_frame=calibration,
        test_frame=held_out,
        strategy="coverage_gap_variable",
        candidate_missing_variables=["Lactate"],
        min_group_fraction=0.25,
    )

    expected_labels = ["lactate_ever_observed", "lactate_never_observed"]
    # ``unique()`` keeps order of first appearance, so this comparison is
    # order-sensitive on purpose.
    distinct_labels = outcome.selection_groups["group_label"].unique()
    assert list(distinct_labels) == expected_labels
def test_minimum_support_filtering_rejects_degenerate_candidates() -> None:
    """Check that a candidate with too small a group raises ``ValueError``.

    A ``RareLab_missing_rate`` column is added whose value is 1.0 for seven
    of the eight rows and 0.0 for the last one. With ``RareLab`` as the only
    candidate and ``min_group_fraction=0.25``, the call must raise a
    ``ValueError`` whose message matches ``"minimum support"``.
    """
    module = _missingness_groups()
    selection = _selection_frame()
    # Only the final row differs, giving a 7-vs-1 split on this column.
    selection["RareLab_missing_rate"] = [1.0] * 7 + [0.0]

    call_kwargs = {
        "selection_frame": selection,
        "calibration_frame": selection.iloc[:4].copy(),
        "test_frame": selection.iloc[4:].copy(),
        "strategy": "coverage_gap_variable",
        "candidate_missing_variables": ["RareLab"],
        "min_group_fraction": 0.25,
    }
    with pytest.raises(ValueError, match="minimum support"):
        module.build_missingness_groups(**call_kwargs)
def test_mask_cluster_returns_cluster_labels_and_k_grid_diagnostics() -> None:
    """Check the labels and per-k diagnostics returned by ``mask_cluster``.

    With a k grid of 2 through 5 on the shared fixture, the group labels must
    be exactly ``cluster_0`` and ``cluster_1``, the diagnostics mapping must
    have one entry per k in the grid, and each entry must include the
    ``silhouette``, ``min_cluster_size`` and ``selection_gap`` keys.
    """
    module = _missingness_groups()
    selection = _selection_frame()
    calibration = selection.iloc[:4].copy()
    held_out = selection.iloc[4:].copy()

    outcome = module.build_missingness_groups(
        selection_frame=selection,
        calibration_frame=calibration,
        test_frame=held_out,
        strategy="mask_cluster",
        mask_cluster_k_grid=[2, 3, 4, 5],
        min_group_fraction=0.25,
    )

    assert set(outcome.group_labels.values()) == {"cluster_0", "cluster_1"}

    diagnostics_by_k = outcome.metadata["mask_cluster_diagnostics"]
    assert set(diagnostics_by_k) == {2, 3, 4, 5}

    required_keys = {"silhouette", "min_cluster_size", "selection_gap"}
    for per_k in diagnostics_by_k.values():
        # Extra keys are allowed; only the required ones must be present.
        assert required_keys <= set(per_k)