File size: 3,377 Bytes
32f5a65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
from __future__ import annotations

import importlib
from pathlib import Path

import pandas as pd
import pytest


def _missingness_groups():
    """Import ``sepsis_mcp.missingness_groups`` on demand and return the module.

    The import happens at call time rather than at module import time, so an
    import failure surfaces inside the test that calls this helper.
    """
    module_path = "sepsis_mcp.missingness_groups"
    return importlib.import_module(module_path)


def _selection_frame() -> pd.DataFrame:
    """Build the eight-row fixture frame used by the tests in this file.

    Columns, in order: ``patient_id`` (``p000`` .. ``p007``), ``target_in_6h``
    (alternating 1/0), ``global_missing_rate``, and one ``<name>_missing_rate``
    column each for Lactate, WBC and Creatinine. Lactate is 0.0 for the first
    four rows and 1.0 for the last four; Creatinine is the reverse; WBC
    alternates 0.0/1.0 row by row.
    """
    n_rows = 8
    half = n_rows // 2
    pairs = n_rows // 2

    columns = {
        "patient_id": [f"p{index:03d}" for index in range(n_rows)],
        "target_in_6h": [1, 0] * pairs,
        "global_missing_rate": [0.10, 0.12, 0.14, 0.16, 0.70, 0.72, 0.74, 0.76],
        "Lactate_missing_rate": [0.0] * half + [1.0] * half,
        "WBC_missing_rate": [0.0, 1.0] * pairs,
        "Creatinine_missing_rate": [1.0] * half + [0.0] * half,
    }
    return pd.DataFrame(columns)


def test_coverage_gap_variable_chooses_variable_with_largest_selection_split_gap() -> None:
    """Check that ``coverage_gap_variable`` picks Lactate from three candidates.

    The fixture frame is passed as the selection split; copies of its first
    four rows and last four rows are passed as the calibration and test
    splits. The result must echo the strategy name, report ``Lactate`` as the
    selected variable, and expose the ever/never-observed labels at entries
    0 and 1 of ``group_labels``.
    """
    module = _missingness_groups()
    cohort = _selection_frame()
    calibration_rows = cohort.iloc[:4].copy()
    test_rows = cohort.iloc[4:].copy()

    outcome = module.build_missingness_groups(
        selection_frame=cohort,
        calibration_frame=calibration_rows,
        test_frame=test_rows,
        strategy="coverage_gap_variable",
        candidate_missing_variables=["Lactate", "WBC", "Creatinine"],
        min_group_fraction=0.25,
    )

    metadata = outcome.metadata
    assert metadata["strategy"] == "coverage_gap_variable"
    assert metadata["selected_variable"] == "Lactate"

    labels = outcome.group_labels
    assert labels[0] == "lactate_ever_observed"
    assert labels[1] == "lactate_never_observed"


def test_variable_grouping_uses_ever_observed_vs_never_observed() -> None:
    """Check the labels assigned to selection rows when Lactate is the only candidate.

    The distinct values of ``selection_groups["group_label"]`` must be exactly
    ``lactate_ever_observed`` followed by ``lactate_never_observed``; the
    comparison is order-sensitive.
    """
    module = _missingness_groups()
    cohort = _selection_frame()
    calibration_rows = cohort.iloc[:4].copy()
    test_rows = cohort.iloc[4:].copy()

    outcome = module.build_missingness_groups(
        selection_frame=cohort,
        calibration_frame=calibration_rows,
        test_frame=test_rows,
        strategy="coverage_gap_variable",
        candidate_missing_variables=["Lactate"],
        min_group_fraction=0.25,
    )

    distinct_labels = outcome.selection_groups["group_label"].unique()
    expected_labels = ["lactate_ever_observed", "lactate_never_observed"]
    assert list(distinct_labels) == expected_labels


def test_minimum_support_filtering_rejects_degenerate_candidates() -> None:
    """Check that a candidate observed in only one of eight rows is rejected.

    A ``RareLab_missing_rate`` column is added whose value is 1.0 for every
    row except the last. With ``RareLab`` as the sole candidate and
    ``min_group_fraction=0.25``, the call must raise ``ValueError`` with a
    message matching ``minimum support``.
    """
    module = _missingness_groups()
    cohort = _selection_frame()
    # Seven fully-missing rows followed by a single fully-observed row.
    cohort["RareLab_missing_rate"] = [1.0] * 7 + [0.0]

    calibration_rows = cohort.iloc[:4].copy()
    test_rows = cohort.iloc[4:].copy()

    with pytest.raises(ValueError, match="minimum support"):
        module.build_missingness_groups(
            selection_frame=cohort,
            calibration_frame=calibration_rows,
            test_frame=test_rows,
            strategy="coverage_gap_variable",
            candidate_missing_variables=["RareLab"],
            min_group_fraction=0.25,
        )


def test_mask_cluster_returns_cluster_labels_and_k_grid_diagnostics() -> None:
    """Check the labels and per-k diagnostics produced by ``mask_cluster``.

    With a k grid of 2..5, the group labels must be exactly ``cluster_0`` and
    ``cluster_1``, the diagnostics must be keyed by every k in the grid, and
    each diagnostics entry must contain at least ``silhouette``,
    ``min_cluster_size`` and ``selection_gap``.
    """
    module = _missingness_groups()
    cohort = _selection_frame()
    calibration_rows = cohort.iloc[:4].copy()
    test_rows = cohort.iloc[4:].copy()

    outcome = module.build_missingness_groups(
        selection_frame=cohort,
        calibration_frame=calibration_rows,
        test_frame=test_rows,
        strategy="mask_cluster",
        mask_cluster_k_grid=[2, 3, 4, 5],
        min_group_fraction=0.25,
    )

    label_values = set(outcome.group_labels.values())
    assert label_values == {"cluster_0", "cluster_1"}

    per_k = outcome.metadata["mask_cluster_diagnostics"]
    assert set(per_k) == {2, 3, 4, 5}

    required_fields = {"silhouette", "min_cluster_size", "selection_gap"}
    for entry in per_k.values():
        assert required_fields.issubset(set(entry))