| from __future__ import annotations |
|
|
| import json |
| from pathlib import Path |
|
|
| import matplotlib.pyplot as plt |
| import pandas as pd |
| import pytest |
|
|
|
|
def _write_dummy_png(path: Path) -> Path:
    """Save a tiny placeholder line plot to *path* and return *path*.

    Missing parent directories are created first. The figure is closed even
    when plotting or saving raises, so a failure here does not leave an open
    figure registered with pyplot for the remainder of the test session.

    Args:
        path: Destination file for the rendered figure.

    Returns:
        The same ``path`` object that was passed in.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    fig, ax = plt.subplots(figsize=(1.5, 1.0))
    try:
        ax.plot([0, 1], [0, 1])
        fig.savefig(path, dpi=100)
    finally:
        # Always release the figure; pyplot keeps it alive until closed.
        plt.close(fig)
    return path
|
|
|
|
def test_run_appendix_artifacts_writes_bundle(tmp_path: Path) -> None:
    """Check that ``run_appendix_artifacts`` emits the expected bundle.

    The test populates five input directories under ``tmp_path`` with minimal
    CSV fixtures (plus one placeholder PNG), invokes ``run_appendix_artifacts``
    and then verifies:

    * the four figure paths returned by the call exist on disk;
    * ``manifest.json`` contains the ``score_figure`` and ``provenance`` keys;
    * the copied/derived source tables exist, and the eta focus table exposes
      ``mean_eta_minus_cost`` (approximately 0.18 for this fixture, which
      matches the fixture row's ``mean_eta`` 0.20 less ``mean_cost`` 0.02)
      and does not expose ``mean_eta_minus_delta``;
    * the calibration-size panel image is present under ``appendix``;
    * ``provenance.json`` carries ``git``, ``inputs`` and ``artifacts``
      sections, with the score ECDF figure derived from ``score_records.csv``;
    * ``TRACEABILITY.md`` is written.
    """
    from sepsis_mcp.neurips_appendix_artifacts import run_appendix_artifacts

    def _dump(rows: list[dict[str, object]], target: Path) -> None:
        # Persist fixture rows as a CSV without the pandas index column.
        pd.DataFrame(rows).to_csv(target, index=False)

    # Keyword name expected by run_appendix_artifacts -> fixture directory.
    inputs = {
        "score_dir": tmp_path / "score",
        "eta_dir": tmp_path / "eta",
        "calsize_dir": tmp_path / "calsize",
        "model_swap_dir": tmp_path / "swap",
        "stress_dir": tmp_path / "stress",
    }
    for folder in inputs.values():
        folder.mkdir()
    # The output directory is intentionally not created up front.
    bundle_root = tmp_path / "artifacts"

    # --- score records -----------------------------------------------------
    score_base = {"dataset": "gossis", "model_type": "xgboost", "seed": 0}
    score_cases = (
        ("calibration", 0, "low", 0.1),
        ("calibration", 1, "high", 0.8),
        ("test", 0, "low", 0.2),
        ("test", 1, "high", 0.9),
    )
    _dump(
        [
            {
                **score_base,
                "split_kind": split,
                "group": group_id,
                "group_label": label,
                "score": value,
            }
            for split, group_id, label, value in score_cases
        ],
        inputs["score_dir"] / "score_records.csv",
    )

    # --- eta aggregate -----------------------------------------------------
    eta_row = {
        "model_type": "xgboost",
        "group_label": "low",
        "mean_eta": 0.20,
        "std_eta": 0.01,
        "mean_delta": 0.05,
        "std_delta": 0.01,
        "mean_cost": 0.02,
        "advantage_rate": 1.0,
    }
    _dump([eta_row], inputs["eta_dir"] / "eta_aggregate.csv")

    # --- calibration-size summary and its panel image ----------------------
    calsize_cases = (
        ("standard", 0.88, 0.06, 0.92),
        ("missingness_aware", 0.90, 0.03, 0.94),
    )
    _dump(
        [
            {
                "cal_size": 250,
                "cal_size_label": "250",
                "method": method,
                "coverage_mean": coverage,
                "max_gap_mean": gap,
                "set_size_mean": set_size,
            }
            for method, coverage, gap, set_size in calsize_cases
        ],
        inputs["calsize_dir"] / "calsize_summary.csv",
    )
    _write_dummy_png(inputs["calsize_dir"] / "calsize_panels.png")

    # --- model-swap matrix and win summary ---------------------------------
    swap_cases = (
        ("logistic_regression", 0.04, 0.90, 0.01),
        ("xgboost", 0.02, 0.91, 0.00),
    )
    _dump(
        [
            {
                "calibrate_model": calibrate_model,
                "deploy_model": "xgboost",
                "grouping": "coverage_gap_variable",
                "mean_gap": gap,
                "mean_coverage": coverage,
                "mean_test_group_overlap_vs_self": 1.0,
                "mean_coverage_drop_vs_self": coverage_drop,
                "mean_gap_increase_vs_self": 0.00,
            }
            for calibrate_model, gap, coverage, coverage_drop in swap_cases
        ],
        inputs["model_swap_dir"] / "swap_matrix.csv",
    )
    win_row = {
        "comparison": "coverage_gap_variable_vs_predicted_risk_tercile_frozen",
        "win_grouping": "coverage_gap_variable",
        "win_count": 3,
        "mean_gap_difference": 0.02,
    }
    _dump([win_row], inputs["model_swap_dir"] / "swap_win_summary.csv")

    # --- stress aggregate --------------------------------------------------
    stress_cases = (
        ("standard", 0.0, 0.03),
        ("standard", 0.3, 0.06),
        ("missingness_aware", 0.0, 0.02),
        ("missingness_aware", 0.3, 0.03),
    )
    _dump(
        [
            {
                "perturbation_type": "mar_dropout",
                "perturbation_strength": strength,
                "drop_rate": strength,
                "method": method,
                "mean_gap": gap,
                "std_gap": 0.0,
                "run_count": 1,
            }
            for method, strength, gap in stress_cases
        ],
        inputs["stress_dir"] / "stress_aggregate.csv",
    )

    paths = run_appendix_artifacts(**inputs, output_dir=bundle_root)

    # Every advertised figure must have been written.
    for figure_key in ("score_figure", "eta_figure", "swap_figure", "stress_figure"):
        assert Path(paths[figure_key]).exists()

    manifest_text = (bundle_root / "manifest.json").read_text(encoding="utf-8")
    manifest = json.loads(manifest_text)
    assert "score_figure" in manifest
    assert "provenance" in manifest

    tables_dir = bundle_root / "source_tables"
    eta_focus_path = tables_dir / "appendix_eta_focus_table.csv"
    assert (tables_dir / "score_records.csv").exists()
    assert eta_focus_path.exists()
    eta_focus = pd.read_csv(eta_focus_path)
    focus_columns = eta_focus.columns
    assert "mean_eta_minus_cost" in focus_columns
    assert "mean_eta_minus_delta" not in focus_columns
    assert eta_focus.loc[0, "mean_eta_minus_cost"] == pytest.approx(0.18)
    assert (bundle_root / "appendix" / "calsize_panels.png").exists()

    provenance_text = (bundle_root / "provenance.json").read_text(encoding="utf-8")
    provenance = json.loads(provenance_text)
    assert "git" in provenance
    assert "inputs" in provenance
    assert "artifacts" in provenance
    artifacts = provenance["artifacts"]
    assert "appendix_score_ecdfs.png" in artifacts
    score_info = artifacts["appendix_score_ecdfs.png"]
    assert score_info["derived_from"] == ["score_records.csv"]
    assert (bundle_root / "TRACEABILITY.md").exists()
|
|