misscp / tests /test_appendix_phase2_analysis.py

Anonymous

Initial anonymous MissCP release

32f5a65 24 days ago

8.93 kB

	from __future__ import annotations

	import json
	from pathlib import Path

	import pandas as pd


	def test_run_appendix_phase2_analysis_writes_expected_outputs(tmp_path: Path) -> None:
	    """Run the appendix phase-2 analysis on small fixture sweeps and check its outputs.

	    The test writes fixture CSVs into four sweep directories under ``tmp_path``
	    (two alpha sweeps, one grouping sweep, one selection sweep), calls
	    ``run_appendix_phase2_analysis`` and then inspects the summary tables and
	    the JSON manifest found at the returned paths.
	    """
	    # Imported inside the test, as in the original, so the module is only
	    # loaded when this test actually runs.
	    from sepsis_mcp.appendix_phase2_analysis import (
	        AppendixPhase2Config,
	        run_appendix_phase2_analysis,
	    )

	    def sweep_row(method: str, alpha: float, **extra: object) -> dict[str, object]:
	        """Build one fixture row: the shared sweep settings followed by ``extra``.

	        Key order is kept stable because it becomes the CSV column order.
	        """
	        return {
	            "experiment": "hospital_disjoint",
	            "method": method,
	            "min_hospital_admissions": 500,
	            "alpha": alpha,
	            "selection_fraction": 0.1,
	            "model_type": "xgboost",
	            "missingness_grouping_strategy": "coverage_gap_variable",
	            "mask_strategy": "none",
	            "mask_rate": 0.0,
	            "selective_feature_group": "labs",
	            "weighted_shrinkage_lambda": 0.5,
	            **extra,
	        }

	    # Input directories are created up front; the output directory is only
	    # named here and handed to the analysis config.
	    alpha005_dir, alpha010_dir, grouping_dir, selection_dir = (
	        tmp_path / name for name in ("alpha005", "alpha010", "grouping", "selection")
	    )
	    results_dir = tmp_path / "phase2"
	    for directory in (alpha005_dir, alpha010_dir, grouping_dir, selection_dir):
	        directory.mkdir(parents=True, exist_ok=True)

	    # --- Alpha sweeps: identical rows, differing only in the alpha column. ---
	    alpha_frame = pd.DataFrame(
	        [
	            sweep_row(
	                "standard",
	                0.05,
	                run_count=2,
	                empirical_coverage_mean=0.95,
	                empirical_coverage_std=0.01,
	                max_group_coverage_gap_mean=0.04,
	                max_group_coverage_gap_std=0.01,
	                average_set_size_mean=1.1,
	                average_set_size_std=0.1,
	                worst_hospital_coverage_mean=0.88,
	                worst_hospital_coverage_std=0.02,
	                wcr=0.82,
	            ),
	            sweep_row(
	                "missingness_aware",
	                0.05,
	                run_count=2,
	                empirical_coverage_mean=0.95,
	                empirical_coverage_std=0.01,
	                max_group_coverage_gap_mean=0.02,
	                max_group_coverage_gap_std=0.01,
	                average_set_size_mean=1.12,
	                average_set_size_std=0.1,
	                worst_hospital_coverage_mean=0.90,
	                worst_hospital_coverage_std=0.02,
	                wcr=0.90,
	            ),
	        ]
	    )
	    for directory, alpha_value in ((alpha005_dir, 0.05), (alpha010_dir, 0.10)):
	        # assign() overwrites the existing alpha column in place (position kept).
	        alpha_frame.assign(alpha=alpha_value).to_csv(
	            directory / "repeated_summary.csv", index=False
	        )

	    # --- Grouping sweep: means only, no *_std columns. ---
	    grouping_frame = pd.DataFrame(
	        [
	            sweep_row(
	                "standard",
	                0.1,
	                run_count=2,
	                empirical_coverage_mean=0.90,
	                max_group_coverage_gap_mean=0.05,
	                average_set_size_mean=0.95,
	                worst_hospital_coverage_mean=0.84,
	                wcr=0.80,
	            ),
	            sweep_row(
	                "missingness_aware",
	                0.1,
	                run_count=2,
	                empirical_coverage_mean=0.90,
	                max_group_coverage_gap_mean=0.02,
	                average_set_size_mean=0.97,
	                worst_hospital_coverage_mean=0.88,
	                wcr=0.88,
	            ),
	        ]
	    )
	    grouping_frame.to_csv(grouping_dir / "repeated_summary.csv", index=False)

	    # --- Selection sweep: stability, Jaccard, repeated and per-run summaries. ---
	    stability_frame = pd.DataFrame(
	        [
	            sweep_row(
	                "missingness_aware",
	                0.1,
	                selected_variable="lactate",
	                count=2,
	                frequency=0.5,
	            ),
	            sweep_row(
	                "missingness_aware",
	                0.1,
	                selected_variable="pao2fio2ratio",
	                count=1,
	                frequency=0.25,
	            ),
	        ]
	    )
	    stability_frame.to_csv(selection_dir / "selected_variable_stability.csv", index=False)

	    jaccard_frame = pd.DataFrame(
	        [
	            sweep_row(
	                "missingness_aware",
	                0.1,
	                run_count=4,
	                mean_pairwise_jaccard=0.3,
	                min_pairwise_jaccard=0.0,
	            )
	        ]
	    )
	    jaccard_frame.to_csv(selection_dir / "selected_variable_jaccard_summary.csv", index=False)

	    # The selection sweep reuses only the missingness-aware grouping rows.
	    aware_mask = grouping_frame["method"] == "missingness_aware"
	    grouping_frame[aware_mask].to_csv(selection_dir / "repeated_summary.csv", index=False)

	    per_run_columns = [
	        "run_id",
	        "method",
	        "selected_variable",
	        "model_type",
	        "empirical_coverage",
	        "max_group_coverage_gap",
	        "average_set_size",
	        "worst_hospital_coverage",
	    ]
	    per_run_records = [
	        ("seed0", "missingness_aware", "lactate", "xgboost", 0.90, 0.02, 0.97, 0.88),
	        ("seed1", "missingness_aware", "pao2fio2ratio", "xgboost", 0.89, 0.03, 0.96, 0.87),
	    ]
	    pd.DataFrame(per_run_records, columns=per_run_columns).to_csv(
	        selection_dir / "overall_summary.csv", index=False
	    )

	    # --- Run the analysis under test. ---
	    config = AppendixPhase2Config(
	        alpha_sweep_dirs=(alpha005_dir, alpha010_dir),
	        grouping_sweep_dir=grouping_dir,
	        selection_sweep_dir=selection_dir,
	        output_dir=results_dir,
	    )
	    paths = run_appendix_phase2_analysis(config)

	    # Load every output before asserting, in the same order as before.
	    tables = {
	        key: pd.read_csv(paths[key])
	        for key in (
	            "alpha_sweep_summary",
	            "grouping_baselines_summary",
	            "selection_stability_summary",
	            "selection_performance_by_variable",
	        )
	    }
	    manifest = json.loads(paths["manifest"].read_text(encoding="utf-8"))

	    alpha_summary = tables["alpha_sweep_summary"]
	    assert set(alpha_summary["alpha"]) == {0.05, 0.10}
	    assert "source_dir" in alpha_summary.columns

	    assert "gap_reduction_vs_standard_mean" in tables["grouping_baselines_summary"].columns

	    stability_summary = tables["selection_stability_summary"]
	    assert stability_summary["top_selected_variable"].iloc[0] == "lactate"
	    assert stability_summary["top_selected_variable_frequency"].iloc[0] == 0.5

	    performance_by_variable = tables["selection_performance_by_variable"]
	    assert set(performance_by_variable["selected_variable"]) == {"lactate", "pao2fio2ratio"}

	    assert Path(manifest["alpha_sweep_summary"]).exists()