# misscp/tests/test_appendix_phase2_analysis.py
# Anonymous
# Initial anonymous MissCP release
# 32f5a65
from __future__ import annotations
import json
from pathlib import Path
import pandas as pd
def _phase2_fixture_row(method: str, alpha: float, **extra: object) -> dict[str, object]:
    """Return one fixture row: the shared sweep configuration followed by ``extra``.

    The configuration keys come first, with ``method`` and ``alpha`` in the
    second and fourth positions, and ``extra`` is appended in the order given,
    so DataFrames built from these rows keep a stable column layout.
    """
    row: dict[str, object] = {
        "experiment": "hospital_disjoint",
        "method": method,
        "min_hospital_admissions": 500,
        "alpha": alpha,
        "selection_fraction": 0.1,
        "model_type": "xgboost",
        "missingness_grouping_strategy": "coverage_gap_variable",
        "mask_strategy": "none",
        "mask_rate": 0.0,
        "selective_feature_group": "labs",
        "weighted_shrinkage_lambda": 0.5,
    }
    row.update(extra)
    return row


def test_run_appendix_phase2_analysis_writes_expected_outputs(tmp_path: Path) -> None:
    """Run the phase-2 appendix analysis on synthetic sweep outputs and check the results.

    The test writes small CSV fixtures into two alpha-sweep directories, one
    grouping-sweep directory and one selection-sweep directory under
    ``tmp_path``, calls ``run_appendix_phase2_analysis`` on them, and then
    inspects the CSV summaries and JSON manifest whose paths it returns.
    """
    from sepsis_mcp.appendix_phase2_analysis import AppendixPhase2Config, run_appendix_phase2_analysis

    # Input directories are created up front; output_dir is only named here.
    alpha005, alpha010, grouping, selection = (
        tmp_path / name for name in ("alpha005", "alpha010", "grouping", "selection")
    )
    output_dir = tmp_path / "phase2"
    for input_dir in (alpha005, alpha010, grouping, selection):
        input_dir.mkdir(parents=True, exist_ok=True)

    # --- Alpha sweep: identical rows written once per alpha level. ---
    repeated_alpha = pd.DataFrame(
        [
            _phase2_fixture_row(
                "standard",
                0.05,
                run_count=2,
                empirical_coverage_mean=0.95,
                empirical_coverage_std=0.01,
                max_group_coverage_gap_mean=0.04,
                max_group_coverage_gap_std=0.01,
                average_set_size_mean=1.1,
                average_set_size_std=0.1,
                worst_hospital_coverage_mean=0.88,
                worst_hospital_coverage_std=0.02,
                wcr=0.82,
            ),
            _phase2_fixture_row(
                "missingness_aware",
                0.05,
                run_count=2,
                empirical_coverage_mean=0.95,
                empirical_coverage_std=0.01,
                max_group_coverage_gap_mean=0.02,
                max_group_coverage_gap_std=0.01,
                average_set_size_mean=1.12,
                average_set_size_std=0.1,
                worst_hospital_coverage_mean=0.90,
                worst_hospital_coverage_std=0.02,
                wcr=0.90,
            ),
        ]
    )
    for sweep_dir, alpha_level in ((alpha005, 0.05), (alpha010, 0.10)):
        repeated_alpha.assign(alpha=alpha_level).to_csv(sweep_dir / "repeated_summary.csv", index=False)

    # --- Grouping sweep: standard vs. missingness-aware at alpha = 0.1. ---
    grouping_repeated = pd.DataFrame(
        [
            _phase2_fixture_row(
                "standard",
                0.1,
                run_count=2,
                empirical_coverage_mean=0.90,
                max_group_coverage_gap_mean=0.05,
                average_set_size_mean=0.95,
                worst_hospital_coverage_mean=0.84,
                wcr=0.80,
            ),
            _phase2_fixture_row(
                "missingness_aware",
                0.1,
                run_count=2,
                empirical_coverage_mean=0.90,
                max_group_coverage_gap_mean=0.02,
                average_set_size_mean=0.97,
                worst_hospital_coverage_mean=0.88,
                wcr=0.88,
            ),
        ]
    )
    grouping_repeated.to_csv(grouping / "repeated_summary.csv", index=False)

    # --- Selection sweep: stability, Jaccard, repeated and per-run summaries. ---
    selected_variable_stability = pd.DataFrame(
        [
            _phase2_fixture_row(
                "missingness_aware",
                0.1,
                selected_variable=variable,
                count=count,
                frequency=frequency,
            )
            for variable, count, frequency in (
                ("lactate", 2, 0.5),
                ("pao2fio2ratio", 1, 0.25),
            )
        ]
    )
    selected_variable_stability.to_csv(selection / "selected_variable_stability.csv", index=False)

    jaccard_summary = pd.DataFrame(
        [
            _phase2_fixture_row(
                "missingness_aware",
                0.1,
                run_count=4,
                mean_pairwise_jaccard=0.3,
                min_pairwise_jaccard=0.0,
            )
        ]
    )
    jaccard_summary.to_csv(selection / "selected_variable_jaccard_summary.csv", index=False)

    # The selection sweep reuses only the missingness-aware grouping rows.
    is_missingness_aware = grouping_repeated["method"] == "missingness_aware"
    grouping_repeated[is_missingness_aware].to_csv(selection / "repeated_summary.csv", index=False)

    overall_summary = pd.DataFrame(
        {
            "run_id": ["seed0", "seed1"],
            "method": ["missingness_aware", "missingness_aware"],
            "selected_variable": ["lactate", "pao2fio2ratio"],
            "model_type": ["xgboost", "xgboost"],
            "empirical_coverage": [0.90, 0.89],
            "max_group_coverage_gap": [0.02, 0.03],
            "average_set_size": [0.97, 0.96],
            "worst_hospital_coverage": [0.88, 0.87],
        }
    )
    overall_summary.to_csv(selection / "overall_summary.csv", index=False)

    # --- Run the analysis under test. ---
    config = AppendixPhase2Config(
        alpha_sweep_dirs=(alpha005, alpha010),
        grouping_sweep_dir=grouping,
        selection_sweep_dir=selection,
        output_dir=output_dir,
    )
    paths = run_appendix_phase2_analysis(config)

    # --- Load the written artefacts. ---
    alpha_summary = pd.read_csv(paths["alpha_sweep_summary"])
    grouping_summary = pd.read_csv(paths["grouping_baselines_summary"])
    stability_summary = pd.read_csv(paths["selection_stability_summary"])
    performance_by_variable = pd.read_csv(paths["selection_performance_by_variable"])
    manifest = json.loads(paths["manifest"].read_text(encoding="utf-8"))

    # Both alpha levels appear in the combined summary, along with a source_dir column.
    assert set(alpha_summary["alpha"]) == {0.05, 0.10}
    assert "source_dir" in alpha_summary.columns

    # The grouping summary carries the gap-reduction-vs-standard column.
    assert "gap_reduction_vs_standard_mean" in grouping_summary.columns

    # The most frequently selected variable is reported first with its frequency.
    assert stability_summary["top_selected_variable"].iloc[0] == "lactate"
    assert stability_summary["top_selected_variable_frequency"].iloc[0] == 0.5

    # Per-variable performance covers every selected variable from the per-run file.
    assert set(performance_by_variable["selected_variable"]) == {"lactate", "pao2fio2ratio"}

    # The manifest points at a file that exists on disk.
    assert Path(manifest["alpha_sweep_summary"]).exists()