# misscp/tests/test_appendix_weighted_baselines_analysis.py
# Anonymous
# Initial anonymous MissCP release
# 32f5a65
from __future__ import annotations
import json
from pathlib import Path
import pandas as pd
def test_run_weighted_baselines_analysis_writes_summary_and_subgroup_outputs(tmp_path: Path) -> None:
    """Check that the weighted-baselines analysis writes its summary, subgroup and manifest outputs.

    The test writes a two-row ``repeated_summary.csv`` and a two-row
    ``subgroup_summary.csv`` (one ``standard`` row and one
    ``weighted_propensity_logistic`` row each) into a temporary sweep
    directory, runs ``run_weighted_baselines_analysis`` on it, and then
    inspects the files referenced by the returned mapping.

    Args:
        tmp_path: Temporary directory (pytest's ``tmp_path`` fixture) that
            holds both the synthetic sweep inputs and the analysis outputs.
    """
    # Imported lazily so that collecting this module does not require the
    # package under test to be importable.
    from sepsis_mcp.appendix_weighted_baselines_analysis import (
        WeightedBaselinesAnalysisConfig,
        run_weighted_baselines_analysis,
    )

    sweep_dir = tmp_path / "sweep"
    output_dir = tmp_path / "analysis"
    sweep_dir.mkdir(parents=True, exist_ok=True)

    # Aggregate rows: identical configuration, differing only in method and
    # in the three metric means.
    repeated_summary = pd.DataFrame(
        [
            {
                "experiment": "hospital_disjoint",
                "method": "standard",
                "min_hospital_admissions": 500,
                "alpha": 0.1,
                "selection_fraction": 0.1,
                "model_type": "xgboost",
                "missingness_grouping_strategy": "coverage_gap_variable",
                "mask_strategy": "none",
                "mask_rate": 0.0,
                "selective_feature_group": "labs",
                "weighted_shrinkage_lambda": 0.5,
                "run_count": 3,
                "empirical_coverage_mean": 0.90,
                "max_group_coverage_gap_mean": 0.05,
                "average_set_size_mean": 0.95,
            },
            {
                "experiment": "hospital_disjoint",
                "method": "weighted_propensity_logistic",
                "min_hospital_admissions": 500,
                "alpha": 0.1,
                "selection_fraction": 0.1,
                "model_type": "xgboost",
                "missingness_grouping_strategy": "coverage_gap_variable",
                "mask_strategy": "none",
                "mask_rate": 0.0,
                "selective_feature_group": "labs",
                "weighted_shrinkage_lambda": 0.5,
                "run_count": 3,
                "empirical_coverage_mean": 0.91,
                "max_group_coverage_gap_mean": 0.03,
                "average_set_size_mean": 0.96,
            },
        ]
    )
    repeated_summary.to_csv(sweep_dir / "repeated_summary.csv", index=False)

    # Per-run subgroup rows: one "low" group entry for each method.
    subgroup_summary = pd.DataFrame(
        [
            {
                "run_id": "seed0",
                "method": "standard",
                "model_type": "xgboost",
                "group_label": "low",
                "count": 100,
                "coverage": 0.85,
                "coverage_gap": -0.05,
                "average_set_size": 0.95,
            },
            {
                "run_id": "seed0",
                "method": "weighted_propensity_logistic",
                "model_type": "xgboost",
                "group_label": "low",
                "count": 100,
                "coverage": 0.89,
                "coverage_gap": -0.01,
                "average_set_size": 0.96,
            },
        ]
    )
    subgroup_summary.to_csv(sweep_dir / "subgroup_summary.csv", index=False)

    paths = run_weighted_baselines_analysis(
        WeightedBaselinesAnalysisConfig(
            sweep_dir=sweep_dir,
            output_dir=output_dir,
        )
    )

    summary = pd.read_csv(paths["weighted_baselines_summary"])
    subgroups = pd.read_csv(paths["weighted_baselines_subgroups"])
    manifest = json.loads(paths["manifest"].read_text(encoding="utf-8"))

    assert "gap_reduction_vs_standard_mean" in summary.columns

    weighted_rows = summary["method"] == "weighted_propensity_logistic"
    gap_reduction = summary.loc[weighted_rows, "gap_reduction_vs_standard_mean"].iloc[0]
    # Compare with a tolerance rather than ``==``: the value is a float read
    # back from CSV, and the expected 0.02 is presumably 0.05 - 0.03 (the two
    # max_group_coverage_gap_mean inputs), which is not exactly 0.02 in binary
    # floating point.
    assert abs(gap_reduction - 0.02) < 1e-9

    assert set(subgroups["method"]) == {"standard", "weighted_propensity_logistic"}
    assert Path(manifest["weighted_baselines_summary"]).exists()