from __future__ import annotations import importlib from pathlib import Path import pandas as pd import pytest def _missingness_groups(): return importlib.import_module("sepsis_mcp.missingness_groups") def _selection_frame() -> pd.DataFrame: return pd.DataFrame( { "patient_id": [f"p{index:03d}" for index in range(8)], "target_in_6h": [1, 0, 1, 0, 1, 0, 1, 0], "global_missing_rate": [0.10, 0.12, 0.14, 0.16, 0.70, 0.72, 0.74, 0.76], "Lactate_missing_rate": [0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0], "WBC_missing_rate": [0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0], "Creatinine_missing_rate": [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0], } ) def test_coverage_gap_variable_chooses_variable_with_largest_selection_split_gap() -> None: mg = _missingness_groups() frame = _selection_frame() result = mg.build_missingness_groups( selection_frame=frame, calibration_frame=frame.iloc[:4].copy(), test_frame=frame.iloc[4:].copy(), strategy="coverage_gap_variable", candidate_missing_variables=["Lactate", "WBC", "Creatinine"], min_group_fraction=0.25, ) assert result.metadata["strategy"] == "coverage_gap_variable" assert result.metadata["selected_variable"] == "Lactate" assert result.group_labels[0] == "lactate_ever_observed" assert result.group_labels[1] == "lactate_never_observed" def test_variable_grouping_uses_ever_observed_vs_never_observed() -> None: mg = _missingness_groups() frame = _selection_frame() result = mg.build_missingness_groups( selection_frame=frame, calibration_frame=frame.iloc[:4].copy(), test_frame=frame.iloc[4:].copy(), strategy="coverage_gap_variable", candidate_missing_variables=["Lactate"], min_group_fraction=0.25, ) assert list(result.selection_groups["group_label"].unique()) == [ "lactate_ever_observed", "lactate_never_observed", ] def test_minimum_support_filtering_rejects_degenerate_candidates() -> None: mg = _missingness_groups() frame = _selection_frame() frame["RareLab_missing_rate"] = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0] with pytest.raises(ValueError, match="minimum support"): mg.build_missingness_groups( selection_frame=frame, calibration_frame=frame.iloc[:4].copy(), test_frame=frame.iloc[4:].copy(), strategy="coverage_gap_variable", candidate_missing_variables=["RareLab"], min_group_fraction=0.25, ) def test_mask_cluster_returns_cluster_labels_and_k_grid_diagnostics() -> None: mg = _missingness_groups() frame = _selection_frame() result = mg.build_missingness_groups( selection_frame=frame, calibration_frame=frame.iloc[:4].copy(), test_frame=frame.iloc[4:].copy(), strategy="mask_cluster", mask_cluster_k_grid=[2, 3, 4, 5], min_group_fraction=0.25, ) assert set(result.group_labels.values()) == {"cluster_0", "cluster_1"} assert set(result.metadata["mask_cluster_diagnostics"]) == {2, 3, 4, 5} for diagnostics in result.metadata["mask_cluster_diagnostics"].values(): assert {"silhouette", "min_cluster_size", "selection_gap"} <= set(diagnostics)