from __future__ import annotations

import json
import math
from pathlib import Path

import pandas as pd


def _sweep_summary_row(
    method: str,
    *,
    empirical_coverage: float,
    max_group_coverage_gap: float,
    average_set_size: float,
) -> dict[str, object]:
    """Build one ``repeated_summary.csv`` fixture row for ``method``.

    Every configuration column is identical across rows; only the method name
    and the three aggregated metrics vary. Key order is significant because it
    becomes the column order of the CSV the test writes.
    """
    return {
        "experiment": "hospital_disjoint",
        "method": method,
        "min_hospital_admissions": 500,
        "alpha": 0.1,
        "selection_fraction": 0.1,
        "model_type": "xgboost",
        "missingness_grouping_strategy": "coverage_gap_variable",
        "mask_strategy": "none",
        "mask_rate": 0.0,
        "selective_feature_group": "labs",
        "weighted_shrinkage_lambda": 0.5,
        "run_count": 3,
        "empirical_coverage_mean": empirical_coverage,
        "max_group_coverage_gap_mean": max_group_coverage_gap,
        "average_set_size_mean": average_set_size,
    }


def _subgroup_row(
    method: str,
    *,
    coverage: float,
    coverage_gap: float,
    average_set_size: float,
) -> dict[str, object]:
    """Build one ``subgroup_summary.csv`` fixture row for ``method``.

    All rows describe the same run (``seed0``) and the same group (``low``);
    only the method name and its metrics vary.
    """
    return {
        "run_id": "seed0",
        "method": method,
        "model_type": "xgboost",
        "group_label": "low",
        "count": 100,
        "coverage": coverage,
        "coverage_gap": coverage_gap,
        "average_set_size": average_set_size,
    }


def test_run_weighted_baselines_analysis_writes_summary_and_subgroup_outputs(tmp_path: Path) -> None:
    """Run the weighted-baselines analysis on a tiny sweep and check its outputs.

    The test writes a two-method ``repeated_summary.csv`` and
    ``subgroup_summary.csv`` into a temporary sweep directory, runs the
    analysis, and verifies that:

    * the summary output has a ``gap_reduction_vs_standard_mean`` column whose
      value for the weighted method is approximately 0.02,
    * the subgroup output contains both methods, and
    * the manifest's ``weighted_baselines_summary`` entry is an existing path.

    Args:
        tmp_path: pytest-provided temporary directory, unique per test.
    """
    # Imported lazily so collecting this module does not require the package.
    from sepsis_mcp.appendix_weighted_baselines_analysis import (
        WeightedBaselinesAnalysisConfig,
        run_weighted_baselines_analysis,
    )

    sweep_dir = tmp_path / "sweep"
    output_dir = tmp_path / "analysis"
    sweep_dir.mkdir(parents=True, exist_ok=True)

    sweep_rows = [
        _sweep_summary_row(
            "standard",
            empirical_coverage=0.90,
            max_group_coverage_gap=0.05,
            average_set_size=0.95,
        ),
        _sweep_summary_row(
            "weighted_propensity_logistic",
            empirical_coverage=0.91,
            max_group_coverage_gap=0.03,
            average_set_size=0.96,
        ),
    ]
    pd.DataFrame(sweep_rows).to_csv(sweep_dir / "repeated_summary.csv", index=False)

    subgroup_rows = [
        _subgroup_row(
            "standard",
            coverage=0.85,
            coverage_gap=-0.05,
            average_set_size=0.95,
        ),
        _subgroup_row(
            "weighted_propensity_logistic",
            coverage=0.89,
            coverage_gap=-0.01,
            average_set_size=0.96,
        ),
    ]
    pd.DataFrame(subgroup_rows).to_csv(sweep_dir / "subgroup_summary.csv", index=False)

    paths = run_weighted_baselines_analysis(
        WeightedBaselinesAnalysisConfig(
            sweep_dir=sweep_dir,
            output_dir=output_dir,
        )
    )

    summary = pd.read_csv(paths["weighted_baselines_summary"])
    subgroups = pd.read_csv(paths["weighted_baselines_subgroups"])
    manifest = json.loads(paths["manifest"].read_text(encoding="utf-8"))

    assert "gap_reduction_vs_standard_mean" in summary.columns

    weighted_mask = summary["method"] == "weighted_propensity_logistic"
    gap_reduction = summary.loc[weighted_mask, "gap_reduction_vs_standard_mean"].iloc[0]
    # Compare with a tolerance rather than ``==``: the expected 0.02 equals the
    # difference of the fixture gaps (presumably how the column is derived --
    # confirm against the analysis module), and 0.05 - 0.03 evaluates to
    # 0.020000000000000004 in binary floating point, so exact equality is
    # fragile across computation and CSV round-tripping.
    assert math.isclose(gap_reduction, 0.02, abs_tol=1e-9)

    assert set(subgroups["method"]) == {"standard", "weighted_propensity_logistic"}
    assert Path(manifest["weighted_baselines_summary"]).exists()