from __future__ import annotations

import json
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import pytest


def _write_dummy_png(path: Path) -> Path:
    """Save a minimal two-point line plot to ``path`` and return ``path``.

    Missing parent directories are created first. The figure is closed after
    saving so it does not stay registered with pyplot.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    figure, axes = plt.subplots(figsize=(1.5, 1.0))
    axes.plot([0, 1], [0, 1])
    figure.savefig(path, dpi=100)
    plt.close(figure)
    return path


def test_run_appendix_artifacts_writes_bundle(tmp_path: Path) -> None:
    """Check that ``run_appendix_artifacts`` emits the expected output bundle.

    Small fixture tables (and one placeholder PNG) are written into separate
    input directories under ``tmp_path``; the function under test is then
    called and the resulting figures, manifest, copied source tables,
    provenance record and traceability note are asserted on.
    """
    # Imported lazily so collecting this module does not require the package.
    from sepsis_mcp.neurips_appendix_artifacts import run_appendix_artifacts

    # Keyword name -> directory; insertion order is the order the keywords
    # are handed to run_appendix_artifacts below.
    input_dirs = {
        "score_dir": tmp_path / "score",
        "eta_dir": tmp_path / "eta",
        "calsize_dir": tmp_path / "calsize",
        "model_swap_dir": tmp_path / "swap",
        "stress_dir": tmp_path / "stress",
    }
    for directory in input_dirs.values():
        directory.mkdir()
    # The output directory is deliberately not created here.
    bundle_dir = tmp_path / "artifacts"

    # --- score records -----------------------------------------------------
    score_rows = [
        {
            "dataset": "gossis",
            "model_type": "xgboost",
            "seed": 0,
            "split_kind": split_kind,
            "group": group,
            "group_label": group_label,
            "score": score,
        }
        for split_kind, group, group_label, score in (
            ("calibration", 0, "low", 0.1),
            ("calibration", 1, "high", 0.8),
            ("test", 0, "low", 0.2),
            ("test", 1, "high", 0.9),
        )
    ]
    score_frame = pd.DataFrame(score_rows)
    score_frame.to_csv(input_dirs["score_dir"] / "score_records.csv", index=False)

    # --- eta aggregate -----------------------------------------------------
    eta_rows = [
        {
            "model_type": "xgboost",
            "group_label": "low",
            "mean_eta": 0.20,
            "std_eta": 0.01,
            "mean_delta": 0.05,
            "std_delta": 0.01,
            "mean_cost": 0.02,
            "advantage_rate": 1.0,
        }
    ]
    eta_frame = pd.DataFrame(eta_rows)
    eta_frame.to_csv(input_dirs["eta_dir"] / "eta_aggregate.csv", index=False)

    # --- calibration-size summary and its pre-rendered panel ---------------
    calsize_rows = [
        {
            "cal_size": 250,
            "cal_size_label": "250",
            "method": method,
            "coverage_mean": coverage_mean,
            "max_gap_mean": max_gap_mean,
            "set_size_mean": set_size_mean,
        }
        for method, coverage_mean, max_gap_mean, set_size_mean in (
            ("standard", 0.88, 0.06, 0.92),
            ("missingness_aware", 0.90, 0.03, 0.94),
        )
    ]
    calsize_frame = pd.DataFrame(calsize_rows)
    calsize_frame.to_csv(
        input_dirs["calsize_dir"] / "calsize_summary.csv", index=False
    )
    _write_dummy_png(input_dirs["calsize_dir"] / "calsize_panels.png")

    # --- model-swap matrix and win summary ---------------------------------
    swap_rows = [
        {
            "calibrate_model": "logistic_regression",
            "deploy_model": "xgboost",
            "grouping": "coverage_gap_variable",
            "mean_gap": 0.04,
            "mean_coverage": 0.90,
            "mean_test_group_overlap_vs_self": 1.0,
            "mean_coverage_drop_vs_self": 0.01,
            "mean_gap_increase_vs_self": 0.00,
        },
        {
            "calibrate_model": "xgboost",
            "deploy_model": "xgboost",
            "grouping": "coverage_gap_variable",
            "mean_gap": 0.02,
            "mean_coverage": 0.91,
            "mean_test_group_overlap_vs_self": 1.0,
            "mean_coverage_drop_vs_self": 0.00,
            "mean_gap_increase_vs_self": 0.00,
        },
    ]
    swap_frame = pd.DataFrame(swap_rows)
    swap_frame.to_csv(input_dirs["model_swap_dir"] / "swap_matrix.csv", index=False)

    win_rows = [
        {
            "comparison": "coverage_gap_variable_vs_predicted_risk_tercile_frozen",
            "win_grouping": "coverage_gap_variable",
            "win_count": 3,
            "mean_gap_difference": 0.02,
        }
    ]
    win_frame = pd.DataFrame(win_rows)
    win_frame.to_csv(
        input_dirs["model_swap_dir"] / "swap_win_summary.csv", index=False
    )

    # --- stress aggregate --------------------------------------------------
    # In this fixture the drop rate always equals the perturbation strength.
    stress_rows = [
        {
            "perturbation_type": "mar_dropout",
            "perturbation_strength": strength,
            "drop_rate": strength,
            "method": method,
            "mean_gap": mean_gap,
            "std_gap": 0.0,
            "run_count": 1,
        }
        for method, strength, mean_gap in (
            ("standard", 0.0, 0.03),
            ("standard", 0.3, 0.06),
            ("missingness_aware", 0.0, 0.02),
            ("missingness_aware", 0.3, 0.03),
        )
    ]
    stress_frame = pd.DataFrame(stress_rows)
    stress_frame.to_csv(
        input_dirs["stress_dir"] / "stress_aggregate.csv", index=False
    )

    # --- exercise the function under test ----------------------------------
    produced = run_appendix_artifacts(**input_dirs, output_dir=bundle_dir)

    # Every reported figure path must point at a file that exists.
    for figure_key in ("score_figure", "eta_figure", "swap_figure", "stress_figure"):
        assert Path(produced[figure_key]).exists()

    manifest_text = (bundle_dir / "manifest.json").read_text(encoding="utf-8")
    manifest = json.loads(manifest_text)
    assert "score_figure" in manifest
    assert "provenance" in manifest

    tables_dir = bundle_dir / "source_tables"
    eta_focus_path = tables_dir / "appendix_eta_focus_table.csv"
    assert (tables_dir / "score_records.csv").exists()
    assert eta_focus_path.exists()

    eta_focus = pd.read_csv(eta_focus_path)
    focus_columns = eta_focus.columns
    assert "mean_eta_minus_cost" in focus_columns
    assert "mean_eta_minus_delta" not in focus_columns
    # 0.18 equals the fixture's mean_eta (0.20) minus its mean_cost (0.02).
    assert eta_focus.loc[0, "mean_eta_minus_cost"] == pytest.approx(0.18)

    assert (bundle_dir / "appendix" / "calsize_panels.png").exists()

    provenance_text = (bundle_dir / "provenance.json").read_text(encoding="utf-8")
    provenance = json.loads(provenance_text)
    for section in ("git", "inputs", "artifacts"):
        assert section in provenance
    artifact_records = provenance["artifacts"]
    assert "appendix_score_ecdfs.png" in artifact_records
    score_info = artifact_records["appendix_score_ecdfs.png"]
    assert score_info["derived_from"] == ["score_records.csv"]

    assert (bundle_dir / "TRACEABILITY.md").exists()