File size: 6,018 Bytes
32f5a65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
from __future__ import annotations

import json
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import pytest


def _write_dummy_png(path: Path) -> Path:
    """Render a tiny placeholder line plot and save it to ``path``.

    Missing parent directories are created first. The figure is closed
    whether or not drawing/saving succeeds, so a failing write does not
    leave an open figure registered with pyplot for later tests.

    Args:
        path: Destination file for the rendered image.

    Returns:
        The same ``path`` object that was passed in.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    fig, ax = plt.subplots(figsize=(1.5, 1.0))
    try:
        ax.plot([0, 1], [0, 1])
        fig.savefig(path, dpi=100)
    finally:
        # pyplot retains figures until they are explicitly closed; release
        # this one even when plotting or saving raises.
        plt.close(fig)
    return path


def test_run_appendix_artifacts_writes_bundle(tmp_path: Path) -> None:
    """Check that ``run_appendix_artifacts`` writes its expected output bundle.

    Small fixture tables (and one placeholder image) are written into five
    input directories under ``tmp_path``. The function is then invoked and
    the test asserts on the returned figure paths, the manifest, the files
    under ``source_tables``, the eta focus table columns, the relocated
    calibration-size panel, and the provenance / traceability files.
    """
    from sepsis_mcp.neurips_appendix_artifacts import run_appendix_artifacts

    def _dump_csv(rows, target):
        # All fixture tables are stored without the pandas index column.
        pd.DataFrame(rows).to_csv(target, index=False)

    # Keys double as the keyword-argument names of run_appendix_artifacts.
    inputs = {
        "score_dir": tmp_path / "score",
        "eta_dir": tmp_path / "eta",
        "calsize_dir": tmp_path / "calsize",
        "model_swap_dir": tmp_path / "swap",
        "stress_dir": tmp_path / "stress",
    }
    bundle_dir = tmp_path / "artifacts"
    for folder in inputs.values():
        folder.mkdir()

    # --- score records -----------------------------------------------------
    score_rows = [
        {
            "dataset": "gossis",
            "model_type": "xgboost",
            "seed": 0,
            "split_kind": split_kind,
            "group": group,
            "group_label": group_label,
            "score": score,
        }
        for split_kind, group, group_label, score in (
            ("calibration", 0, "low", 0.1),
            ("calibration", 1, "high", 0.8),
            ("test", 0, "low", 0.2),
            ("test", 1, "high", 0.9),
        )
    ]
    _dump_csv(score_rows, inputs["score_dir"] / "score_records.csv")

    # --- eta aggregate -----------------------------------------------------
    eta_rows = [
        dict(
            model_type="xgboost",
            group_label="low",
            mean_eta=0.20,
            std_eta=0.01,
            mean_delta=0.05,
            std_delta=0.01,
            mean_cost=0.02,
            advantage_rate=1.0,
        )
    ]
    _dump_csv(eta_rows, inputs["eta_dir"] / "eta_aggregate.csv")

    # --- calibration-size summary and its panel image ------------------------
    calsize_rows = [
        {
            "cal_size": 250,
            "cal_size_label": "250",
            "method": method,
            "coverage_mean": coverage,
            "max_gap_mean": max_gap,
            "set_size_mean": set_size,
        }
        for method, coverage, max_gap, set_size in (
            ("standard", 0.88, 0.06, 0.92),
            ("missingness_aware", 0.90, 0.03, 0.94),
        )
    ]
    _dump_csv(calsize_rows, inputs["calsize_dir"] / "calsize_summary.csv")
    _write_dummy_png(inputs["calsize_dir"] / "calsize_panels.png")

    # --- model-swap matrix and win summary -----------------------------------
    swap_rows = [
        {
            "calibrate_model": calibrate_model,
            "deploy_model": "xgboost",
            "grouping": "coverage_gap_variable",
            "mean_gap": gap,
            "mean_coverage": coverage,
            "mean_test_group_overlap_vs_self": 1.0,
            "mean_coverage_drop_vs_self": coverage_drop,
            "mean_gap_increase_vs_self": 0.00,
        }
        for calibrate_model, gap, coverage, coverage_drop in (
            ("logistic_regression", 0.04, 0.90, 0.01),
            ("xgboost", 0.02, 0.91, 0.00),
        )
    ]
    _dump_csv(swap_rows, inputs["model_swap_dir"] / "swap_matrix.csv")

    win_rows = [
        dict(
            comparison="coverage_gap_variable_vs_predicted_risk_tercile_frozen",
            win_grouping="coverage_gap_variable",
            win_count=3,
            mean_gap_difference=0.02,
        )
    ]
    _dump_csv(win_rows, inputs["model_swap_dir"] / "swap_win_summary.csv")

    # --- stress aggregate (drop_rate mirrors perturbation_strength) ----------
    stress_rows = [
        {
            "perturbation_type": "mar_dropout",
            "perturbation_strength": strength,
            "drop_rate": strength,
            "method": method,
            "mean_gap": gap,
            "std_gap": 0.0,
            "run_count": 1,
        }
        for method, strength, gap in (
            ("standard", 0.0, 0.03),
            ("standard", 0.3, 0.06),
            ("missingness_aware", 0.0, 0.02),
            ("missingness_aware", 0.3, 0.03),
        )
    ]
    _dump_csv(stress_rows, inputs["stress_dir"] / "stress_aggregate.csv")

    paths = run_appendix_artifacts(output_dir=bundle_dir, **inputs)

    # Every figure path reported back must point at a real file.
    for figure_key in ("score_figure", "eta_figure", "swap_figure", "stress_figure"):
        assert Path(paths[figure_key]).exists()

    manifest_text = (bundle_dir / "manifest.json").read_text(encoding="utf-8")
    manifest = json.loads(manifest_text)
    for manifest_key in ("score_figure", "provenance"):
        assert manifest_key in manifest

    tables_dir = bundle_dir / "source_tables"
    eta_focus_csv = tables_dir / "appendix_eta_focus_table.csv"
    assert (tables_dir / "score_records.csv").exists()
    assert eta_focus_csv.exists()

    eta_focus = pd.read_csv(eta_focus_csv)
    focus_columns = eta_focus.columns
    assert "mean_eta_minus_cost" in focus_columns
    assert "mean_eta_minus_delta" not in focus_columns
    # 0.18 equals the fixture's mean_eta (0.20) minus its mean_cost (0.02).
    assert eta_focus.loc[0, "mean_eta_minus_cost"] == pytest.approx(0.18)
    assert (bundle_dir / "appendix" / "calsize_panels.png").exists()

    provenance_text = (bundle_dir / "provenance.json").read_text(encoding="utf-8")
    provenance = json.loads(provenance_text)
    for section in ("git", "inputs", "artifacts"):
        assert section in provenance
    artifact_index = provenance["artifacts"]
    assert "appendix_score_ecdfs.png" in artifact_index
    score_info = artifact_index["appendix_score_ecdfs.png"]
    assert score_info["derived_from"] == ["score_records.csv"]
    assert (bundle_dir / "TRACEABILITY.md").exists()