| from __future__ import annotations |
|
|
| from pathlib import Path |
|
|
| import pandas as pd |
| import pytest |
|
|
| from sepsis_mcp.constants import TARGET_COLUMN |
| from sepsis_mcp.dataset import build_labeled_patient_frame |
| from sepsis_mcp.features import TabularFeaturizer |
| from sepsis_mcp.io import load_patient_frame |
|
|
|
|
def _write_feature_patient(path: Path) -> Path:
    """Write a small pipe-delimited patient fixture to ``path`` and return it.

    The file holds one header row followed by five data rows (``ICULOS`` 1-5).
    ``O2Sat`` is ``NaN`` in the second data row, and ``SepsisLabel`` is 0 for
    the first three rows and 1 for the last two. Rows are joined with ``\\n``
    and written as UTF-8 with no trailing newline.

    Args:
        path: Destination file; it is created or overwritten.

    Returns:
        The same ``path`` object, so the call can be used inline.
    """
    header = "HR|O2Sat|Age|Gender|Unit1|Unit2|HospAdmTime|ICULOS|SepsisLabel"
    rows = (
        "80|95|65|1|1|0|-5|1|0",
        "82|NaN|65|1|1|0|-5|2|0",
        "84|97|65|1|1|0|-5|3|0",
        "88|93|65|1|1|0|-5|4|1",
        "90|92|65|1|1|0|-5|5|1",
    )
    contents = "\n".join([header, *rows])
    path.write_text(contents, encoding="utf-8")
    return path
|
|
|
|
def test_build_patient_feature_frame_computes_window_statistics_and_missingness(
    tmp_path: Path,
) -> None:
    """Check window statistics and missing-rate features for one sample.

    A fixture patient is written to ``tmp_path``, loaded, labeled with a
    two-hour horizon, and featurized with a three-hour lookback over ``HR``
    and ``O2Sat``. The assertions inspect the row whose ``sample_index`` is 2.
    In the fixture, the first three rows carry ``HR`` values 80/82/84 and
    ``O2Sat`` values 95/NaN/97, which is what the expected numbers below are
    derived from (the expected ``HR_std`` equals sqrt(8/3), i.e. a population
    standard deviation of those three readings).
    """
    source_file = _write_feature_patient(tmp_path / "p000010.psv")
    frame = load_patient_frame(source_file)
    labeled_frame = build_labeled_patient_frame(
        frame,
        patient_id="p000010",
        horizon_hours=2,
    )
    builder = TabularFeaturizer(dynamic_columns=["HR", "O2Sat"], lookback_hours=3)

    features = builder.build_patient_feature_frame(frame, labeled_frame)
    is_target_sample = features["sample_index"] == 2
    row = features.loc[is_target_sample].iloc[0]

    # Expected approximate values, checked in the listed order.
    expected_approx = {
        "HR_last": 84.0,
        "HR_mean": 82.0,
        "HR_std": 1.632993161855452,
        "HR_slope": 2.0,
        "HR_missing_rate": 0.0,
        "O2Sat_last": 97.0,
        "O2Sat_mean": 96.0,
        "O2Sat_std": 1.0,
        "O2Sat_slope": 1.0,
        # One of the three O2Sat readings in the window is NaN.
        "O2Sat_missing_rate": 1 / 3,
        # One missing value out of six (two columns x three rows).
        "global_missing_rate": 1 / 6,
    }
    for column, value in expected_approx.items():
        assert row[column] == pytest.approx(value), column

    # The label is compared exactly rather than approximately.
    assert row[TARGET_COLUMN] == 1
|
|
|
|
def test_fit_transform_uses_training_medians_for_numeric_feature_imputation() -> None:
    """Check that ``fit_transform`` fills missing feature values with medians.

    The input frame has three rows with gaps in ``HR_last`` (one missing) and
    ``O2Sat_mean`` (two missing). After ``fit_transform`` the test expects:

    * ``feature_columns_`` to be exactly ``["HR_last", "O2Sat_mean"]``;
    * ``medians_`` to hold 90.0 for ``HR_last`` (median of 80 and 100) and
      95.0 for ``O2Sat_mean`` (its only observed value);
    * the missing cells in the returned frame to be replaced by those medians.
    """
    columns = {
        "patient_id": ["p1", "p2", "p3"],
        "sample_index": [0, 1, 2],
        TARGET_COLUMN: [0, 1, 0],
        "global_missing_rate": [0.0, 0.5, 1.0],
        "HR_last": [80.0, 100.0, None],
        "O2Sat_mean": [95.0, None, None],
    }
    frame = pd.DataFrame(columns)
    builder = TabularFeaturizer(dynamic_columns=["HR", "O2Sat"], lookback_hours=3)

    imputed = builder.fit_transform(frame)

    assert builder.feature_columns_ == ["HR_last", "O2Sat_mean"]

    # Medians learned during fitting, checked in the same order as before.
    for column, median in (("HR_last", 90.0), ("O2Sat_mean", 95.0)):
        assert builder.medians_[column] == pytest.approx(median)

    # Missing entries are replaced by the fitted medians; observed ones are kept.
    assert imputed["HR_last"].tolist() == [80.0, 100.0, 90.0]
    assert imputed["O2Sat_mean"].tolist() == [95.0, 95.0, 95.0]
|
|