| from __future__ import annotations |
|
|
| import pytest |
| import pandas as pd |
|
|
|
|
def test_compute_worst_asymmetry_from_subgroups_splits_under_and_over() -> None:
    """Check the asymmetry metrics reported for a single (dataset, method, seed) run.

    Four subgroups share one grouping key and a 0.9 target coverage; two of
    them fall below the target and two land above it. The test expects one
    output row whose worst under-coverage is 0.08, worst over-coverage is
    0.03, whose average absolute gap is the mean of the four absolute gaps,
    and whose coverage variance is the population variance (``ddof=0``).
    """
    from sepsis_mcp.coverage_asymmetry_analysis import (
        compute_worst_asymmetry_from_subgroups,
    )

    n_groups = 4
    target = 0.9
    coverage_values = [0.82, 0.93, 0.88, 0.91]

    subgroup_frame = pd.DataFrame(
        {
            "dataset": ["demo"] * n_groups,
            "method": ["standard"] * n_groups,
            "seed": [0] * n_groups,
            "coverage": coverage_values,
            "target_coverage": [target] * n_groups,
            "average_set_size": [0.9, 1.0, 1.1, 1.2],
            "group_label": ["a", "b", "c", "d"],
        }
    )

    asymmetry = compute_worst_asymmetry_from_subgroups(
        subgroup_frame,
        group_columns=["dataset", "method", "seed"],
    )

    # All four subgroups share the same key, so they collapse into one row.
    assert len(asymmetry) == 1
    only_row = asymmetry.iloc[0]

    expected_mean_abs_gap = (0.08 + 0.03 + 0.02 + 0.01) / 4
    expected_variance = pd.Series(coverage_values).var(ddof=0)

    assert only_row["worst_under_coverage"] == pytest.approx(0.08)
    assert only_row["worst_over_coverage"] == pytest.approx(0.03)
    assert only_row["average_absolute_gap"] == pytest.approx(expected_mean_abs_gap)
    assert only_row["coverage_variance"] == pytest.approx(expected_variance)
|
|
|
|
def test_build_full_subgroup_summary_aggregates_coverage_count_and_set_size() -> None:
    """Check that the subgroup summary exposes ``*_mean`` columns per group key.

    Each (dataset, model_type, method, group_label) combination occurs exactly
    once in the input, so the test expects the mean columns for the
    ``standard`` / ``low`` entry to equal that single input row's values.
    """
    from sepsis_mcp.coverage_asymmetry_analysis import build_full_subgroup_summary

    key_columns = ["dataset", "model_type", "method", "group_label"]
    subgroup_frame = pd.DataFrame(
        {
            "dataset": ["demo"] * 4,
            "model_type": ["xgboost"] * 4,
            "method": ["standard", "standard", "missingness_aware", "missingness_aware"],
            "group_label": ["low", "high", "low", "high"],
            "count": [10, 20, 12, 18],
            "coverage": [0.84, 0.93, 0.89, 0.91],
            "coverage_gap": [-0.06, 0.03, -0.01, 0.01],
            "average_set_size": [0.92, 1.01, 0.95, 1.03],
        }
    )

    summary = build_full_subgroup_summary(subgroup_frame, group_columns=key_columns)

    required_columns = {"count_mean", "coverage_mean", "average_set_size_mean"}
    assert required_columns.issubset(summary.columns)

    # Pick out the single summary row for the standard method's "low" group.
    is_standard = summary["method"] == "standard"
    is_low = summary["group_label"] == "low"
    standard_low = summary.loc[is_standard & is_low].iloc[0]

    assert standard_low["count_mean"] == pytest.approx(10)
    assert standard_low["coverage_mean"] == pytest.approx(0.84)
    assert standard_low["average_set_size_mean"] == pytest.approx(0.92)
|
|
|
|
def test_run_coverage_asymmetry_analysis_writes_full_decomposition_summary(tmp_path) -> None:
    """Run the analysis end to end on tiny on-disk fixtures and check its outputs.

    The test writes one ``subgroup_summary.csv`` per model directory under a
    temporary ``gossis`` folder and a single ``subgroup_runs.csv`` under a
    temporary ``mimic`` folder, runs the analysis, then reads back the files
    returned under the ``"summary"`` and ``"subgroup_summary"`` keys and
    checks that the expected aggregate columns are present.
    """
    from sepsis_mcp.coverage_asymmetry_analysis import (
        CoverageAsymmetryConfig,
        run_coverage_asymmetry_analysis,
    )

    gossis_root = tmp_path / "gossis"
    mimic_root = tmp_path / "mimic"
    results_dir = tmp_path / "out"

    # Per-subgroup measurements shared by every model directory's fixture.
    gossis_groups = (
        {
            "group_label": "low",
            "count": 10,
            "coverage": 0.82,
            "coverage_gap": -0.08,
            "average_set_size": 0.95,
        },
        {
            "group_label": "high",
            "count": 20,
            "coverage": 0.93,
            "coverage_gap": 0.03,
            "average_set_size": 1.02,
        },
    )
    for model_name in ("logistic_regression", "xgboost", "mlp"):
        model_folder = gossis_root / model_name
        model_folder.mkdir(parents=True)
        # Key order is kept as-is so the CSV column order stays the same.
        gossis_rows = [
            {
                "model_type": model_name,
                "method": "standard",
                "random_state": 0,
                **group_fields,
                "missingness_grouping_strategy": "coverage_gap_variable",
            }
            for group_fields in gossis_groups
        ]
        pd.DataFrame(gossis_rows).to_csv(model_folder / "subgroup_summary.csv", index=False)

    mimic_root.mkdir(parents=True)
    mimic_groups = (
        {
            "group_label": "low",
            "count": 12,
            "coverage": 0.84,
            "coverage_gap": -0.06,
            "average_set_size": 0.91,
        },
        {
            "group_label": "high",
            "count": 18,
            "coverage": 0.91,
            "coverage_gap": 0.01,
            "average_set_size": 1.01,
        },
    )
    mimic_rows = [
        {
            "model_type": "xgboost",
            "method": "standard",
            "split_id": "split0",
            **group_fields,
        }
        for group_fields in mimic_groups
    ]
    pd.DataFrame(mimic_rows).to_csv(mimic_root / "subgroup_runs.csv", index=False)

    config = CoverageAsymmetryConfig(
        gossis_base_dir=gossis_root,
        mimic_base_dirs=(mimic_root,),
        output_dir=results_dir,
    )
    written_paths = run_coverage_asymmetry_analysis(config)

    overall_summary = pd.read_csv(written_paths["summary"])
    per_subgroup_summary = pd.read_csv(written_paths["subgroup_summary"])

    overall_required = {"average_absolute_gap_mean", "coverage_variance_mean"}
    subgroup_required = {"count_mean", "coverage_mean", "average_set_size_mean"}
    assert overall_required.issubset(overall_summary.columns)
    assert subgroup_required.issubset(per_subgroup_summary.columns)
|
|