File size: 8,933 Bytes
32f5a65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
from __future__ import annotations

import json
from pathlib import Path

import pandas as pd


def test_run_appendix_phase2_analysis_writes_expected_outputs(tmp_path: Path) -> None:
    """Run the appendix phase-2 analysis on synthetic sweep folders and check its outputs.

    Four input directories (two alpha sweeps, one grouping sweep, one selection
    sweep) are populated with small hand-written CSV files under ``tmp_path``.
    The analysis is then executed and the summary CSVs plus the JSON manifest
    referenced by the returned ``paths`` mapping are inspected.
    """
    from sepsis_mcp.appendix_phase2_analysis import AppendixPhase2Config, run_appendix_phase2_analysis

    def sweep_row(method, alpha, **extra):
        """Return one CSV row: the shared sweep-configuration columns followed by ``extra``.

        Key order is kept stable because it determines the column order of the
        CSV files written below.
        """
        return {
            "experiment": "hospital_disjoint",
            "method": method,
            "min_hospital_admissions": 500,
            "alpha": alpha,
            "selection_fraction": 0.1,
            "model_type": "xgboost",
            "missingness_grouping_strategy": "coverage_gap_variable",
            "mask_strategy": "none",
            "mask_rate": 0.0,
            "selective_feature_group": "labs",
            "weighted_shrinkage_lambda": 0.5,
            **extra,
        }

    # --- input / output locations -------------------------------------------------
    input_dirs = tuple(tmp_path / name for name in ("alpha005", "alpha010", "grouping", "selection"))
    alpha005_dir, alpha010_dir, grouping_dir, selection_dir = input_dirs
    results_dir = tmp_path / "phase2"  # intentionally not created by the test
    for directory in input_dirs:
        directory.mkdir(parents=True, exist_ok=True)

    # --- alpha sweep: same two rows written once per alpha level -------------------
    alpha_rows = [
        sweep_row(
            "standard",
            0.05,
            run_count=2,
            empirical_coverage_mean=0.95,
            empirical_coverage_std=0.01,
            max_group_coverage_gap_mean=0.04,
            max_group_coverage_gap_std=0.01,
            average_set_size_mean=1.1,
            average_set_size_std=0.1,
            worst_hospital_coverage_mean=0.88,
            worst_hospital_coverage_std=0.02,
            wcr=0.82,
        ),
        sweep_row(
            "missingness_aware",
            0.05,
            run_count=2,
            empirical_coverage_mean=0.95,
            empirical_coverage_std=0.01,
            max_group_coverage_gap_mean=0.02,
            max_group_coverage_gap_std=0.01,
            average_set_size_mean=1.12,
            average_set_size_std=0.1,
            worst_hospital_coverage_mean=0.90,
            worst_hospital_coverage_std=0.02,
            wcr=0.90,
        ),
    ]
    alpha_frame = pd.DataFrame(alpha_rows)
    alpha_frame.to_csv(alpha005_dir / "repeated_summary.csv", index=False)
    alpha010_frame = alpha_frame.assign(alpha=0.10)
    alpha010_frame.to_csv(alpha010_dir / "repeated_summary.csv", index=False)

    # --- grouping sweep (no *_std columns in this fixture) -------------------------
    grouping_frame = pd.DataFrame(
        [
            sweep_row(
                "standard",
                0.1,
                run_count=2,
                empirical_coverage_mean=0.90,
                max_group_coverage_gap_mean=0.05,
                average_set_size_mean=0.95,
                worst_hospital_coverage_mean=0.84,
                wcr=0.80,
            ),
            sweep_row(
                "missingness_aware",
                0.1,
                run_count=2,
                empirical_coverage_mean=0.90,
                max_group_coverage_gap_mean=0.02,
                average_set_size_mean=0.97,
                worst_hospital_coverage_mean=0.88,
                wcr=0.88,
            ),
        ]
    )
    grouping_frame.to_csv(grouping_dir / "repeated_summary.csv", index=False)

    # --- selection sweep: stability, Jaccard, repeated and per-run summaries -------
    stability_frame = pd.DataFrame(
        [
            sweep_row("missingness_aware", 0.1, selected_variable="lactate", count=2, frequency=0.5),
            sweep_row("missingness_aware", 0.1, selected_variable="pao2fio2ratio", count=1, frequency=0.25),
        ]
    )
    stability_frame.to_csv(selection_dir / "selected_variable_stability.csv", index=False)

    jaccard_frame = pd.DataFrame(
        [
            sweep_row(
                "missingness_aware",
                0.1,
                run_count=4,
                mean_pairwise_jaccard=0.3,
                min_pairwise_jaccard=0.0,
            )
        ]
    )
    jaccard_frame.to_csv(selection_dir / "selected_variable_jaccard_summary.csv", index=False)

    # The selection sweep reuses only the missingness-aware row of the grouping fixture.
    aware_only = grouping_frame.loc[grouping_frame["method"] == "missingness_aware"]
    aware_only.to_csv(selection_dir / "repeated_summary.csv", index=False)

    per_run_frame = pd.DataFrame(
        {
            "run_id": ["seed0", "seed1"],
            "method": ["missingness_aware", "missingness_aware"],
            "selected_variable": ["lactate", "pao2fio2ratio"],
            "model_type": ["xgboost", "xgboost"],
            "empirical_coverage": [0.90, 0.89],
            "max_group_coverage_gap": [0.02, 0.03],
            "average_set_size": [0.97, 0.96],
            "worst_hospital_coverage": [0.88, 0.87],
        }
    )
    per_run_frame.to_csv(selection_dir / "overall_summary.csv", index=False)

    # --- run the analysis ----------------------------------------------------------
    config = AppendixPhase2Config(
        alpha_sweep_dirs=(alpha005_dir, alpha010_dir),
        grouping_sweep_dir=grouping_dir,
        selection_sweep_dir=selection_dir,
        output_dir=results_dir,
    )
    paths = run_appendix_phase2_analysis(config)

    # --- load every produced artefact before asserting on any of them --------------
    produced = {
        key: pd.read_csv(paths[key])
        for key in (
            "alpha_sweep_summary",
            "grouping_baselines_summary",
            "selection_stability_summary",
            "selection_performance_by_variable",
        )
    }
    manifest = json.loads(paths["manifest"].read_text(encoding="utf-8"))

    alpha_summary = produced["alpha_sweep_summary"]
    assert set(alpha_summary["alpha"]) == {0.05, 0.10}
    assert "source_dir" in alpha_summary.columns

    assert "gap_reduction_vs_standard_mean" in produced["grouping_baselines_summary"].columns

    first_stability = produced["selection_stability_summary"].iloc[0]
    assert first_stability["top_selected_variable"] == "lactate"
    assert first_stability["top_selected_variable_frequency"] == 0.5

    per_variable = produced["selection_performance_by_variable"]
    assert set(per_variable["selected_variable"]) == {"lactate", "pao2fio2ratio"}

    assert Path(manifest["alpha_sweep_summary"]).exists()