| from __future__ import annotations |
|
|
| import importlib |
| from pathlib import Path |
|
|
| import pandas as pd |
| import pytest |
|
|
|
|
| def _missingness_groups(): |
| return importlib.import_module("sepsis_mcp.missingness_groups") |
|
|
|
|
| def _selection_frame() -> pd.DataFrame: |
| return pd.DataFrame( |
| { |
| "patient_id": [f"p{index:03d}" for index in range(8)], |
| "target_in_6h": [1, 0, 1, 0, 1, 0, 1, 0], |
| "global_missing_rate": [0.10, 0.12, 0.14, 0.16, 0.70, 0.72, 0.74, 0.76], |
| "Lactate_missing_rate": [0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0], |
| "WBC_missing_rate": [0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0], |
| "Creatinine_missing_rate": [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0], |
| } |
| ) |
|
|
|
|
def test_coverage_gap_variable_chooses_variable_with_largest_selection_split_gap() -> None:
    """Check which candidate the ``coverage_gap_variable`` strategy selects.

    Lactate, WBC and Creatinine are offered as candidates. The result is
    expected to report Lactate as the selected variable and to label
    groups 0 and 1 as ever-observed and never-observed respectively.
    """
    module = _missingness_groups()
    patients = _selection_frame()
    # First four rows act as the calibration split, the last four as test.
    calibration_rows = patients.iloc[:4].copy()
    test_rows = patients.iloc[4:].copy()

    grouping = module.build_missingness_groups(
        selection_frame=patients,
        calibration_frame=calibration_rows,
        test_frame=test_rows,
        strategy="coverage_gap_variable",
        candidate_missing_variables=["Lactate", "WBC", "Creatinine"],
        min_group_fraction=0.25,
    )

    metadata = grouping.metadata
    assert metadata["strategy"] == "coverage_gap_variable"
    assert metadata["selected_variable"] == "Lactate"

    labels = grouping.group_labels
    assert labels[0] == "lactate_ever_observed"
    assert labels[1] == "lactate_never_observed"
|
|
|
|
def test_variable_grouping_uses_ever_observed_vs_never_observed() -> None:
    """Check the labels assigned to selection rows for a single candidate.

    With Lactate as the only candidate, the distinct ``group_label`` values
    in ``selection_groups`` must be the ever-observed label followed by the
    never-observed label, in order of first appearance.
    """
    module = _missingness_groups()
    patients = _selection_frame()
    calibration_rows = patients.iloc[:4].copy()
    test_rows = patients.iloc[4:].copy()

    grouping = module.build_missingness_groups(
        selection_frame=patients,
        calibration_frame=calibration_rows,
        test_frame=test_rows,
        strategy="coverage_gap_variable",
        candidate_missing_variables=["Lactate"],
        min_group_fraction=0.25,
    )

    expected_labels = ["lactate_ever_observed", "lactate_never_observed"]
    distinct_labels = list(grouping.selection_groups["group_label"].unique())
    assert distinct_labels == expected_labels
|
|
|
|
def test_minimum_support_filtering_rejects_degenerate_candidates() -> None:
    """Check that a badly unbalanced candidate is rejected.

    ``RareLab`` has a missing rate of 1.0 for seven of the eight patients
    and 0.0 for the last one. With ``min_group_fraction=0.25`` and RareLab
    as the only candidate, the call is expected to raise ``ValueError``
    with a message matching "minimum support".
    """
    module = _missingness_groups()
    patients = _selection_frame()
    # Seven fully-missing rows followed by one fully-observed row.
    patients["RareLab_missing_rate"] = [1.0] * 7 + [0.0]

    build_kwargs = {
        "selection_frame": patients,
        "calibration_frame": patients.iloc[:4].copy(),
        "test_frame": patients.iloc[4:].copy(),
        "strategy": "coverage_gap_variable",
        "candidate_missing_variables": ["RareLab"],
        "min_group_fraction": 0.25,
    }

    with pytest.raises(ValueError, match="minimum support"):
        module.build_missingness_groups(**build_kwargs)
|
|
|
|
def test_mask_cluster_returns_cluster_labels_and_k_grid_diagnostics() -> None:
    """Check labels and per-k diagnostics of the ``mask_cluster`` strategy.

    Expects exactly the labels ``cluster_0`` and ``cluster_1``, one
    diagnostics entry keyed by each k in the requested grid, and at least
    the ``silhouette``, ``min_cluster_size`` and ``selection_gap`` keys in
    every entry.
    """
    module = _missingness_groups()
    patients = _selection_frame()
    k_grid = (2, 3, 4, 5)
    required_keys = {"silhouette", "min_cluster_size", "selection_gap"}

    grouping = module.build_missingness_groups(
        selection_frame=patients,
        calibration_frame=patients.iloc[:4].copy(),
        test_frame=patients.iloc[4:].copy(),
        strategy="mask_cluster",
        mask_cluster_k_grid=list(k_grid),
        min_group_fraction=0.25,
    )

    assert set(grouping.group_labels.values()) == {"cluster_0", "cluster_1"}

    assert set(grouping.metadata["mask_cluster_diagnostics"]) == set(k_grid)
    for per_k in grouping.metadata["mask_cluster_diagnostics"].values():
        # Extra diagnostic keys are allowed; only the required ones are checked.
        assert required_keys <= set(per_k)
|
|