"""Tests for ``TabularFeaturizer`` window statistics and median imputation."""

from __future__ import annotations

from pathlib import Path

import pandas as pd
import pytest

from sepsis_mcp.constants import TARGET_COLUMN
from sepsis_mcp.dataset import build_labeled_patient_frame
from sepsis_mcp.features import TabularFeaturizer
from sepsis_mcp.io import load_patient_frame


def _write_feature_patient(path: Path) -> Path:
    """Write a small pipe-separated patient record to *path* and return it.

    The record has a header plus five rows (ICULOS 1 through 5). The second
    row has ``NaN`` for O2Sat, and SepsisLabel is 1 on the last two rows.
    """
    header = "HR|O2Sat|Age|Gender|Unit1|Unit2|HospAdmTime|ICULOS|SepsisLabel"
    hourly_rows = (
        "80|95|65|1|1|0|-5|1|0",
        "82|NaN|65|1|1|0|-5|2|0",
        "84|97|65|1|1|0|-5|3|0",
        "88|93|65|1|1|0|-5|4|1",
        "90|92|65|1|1|0|-5|5|1",
    )
    content = "\n".join((header, *hourly_rows))
    path.write_text(content, encoding="utf-8")
    return path


def test_build_patient_feature_frame_computes_window_statistics_and_missingness(
    tmp_path: Path,
) -> None:
    """Check per-signal statistics and missingness for the sample at index 2."""
    psv_path = _write_feature_patient(tmp_path / "p000010.psv")
    patient_frame = load_patient_frame(psv_path)
    labeled = build_labeled_patient_frame(
        patient_frame,
        patient_id="p000010",
        horizon_hours=2,
    )
    featurizer = TabularFeaturizer(dynamic_columns=["HR", "O2Sat"], lookback_hours=3)

    raw_features = featurizer.build_patient_feature_frame(patient_frame, labeled)
    is_third_sample = raw_features["sample_index"] == 2
    third_sample = raw_features.loc[is_third_sample].iloc[0]

    # Expected values are consistent with a window over the first three rows
    # (HR 80/82/84, O2Sat 95/NaN/97) and a ddof=0 standard deviation.
    # NOTE(review): the exact windowing rule lives in TabularFeaturizer; confirm there.
    expected_statistics = {
        "HR_last": 84.0,
        "HR_mean": 82.0,
        "HR_std": 1.632993161855452,
        "HR_slope": 2.0,
        "HR_missing_rate": 0.0,
        "O2Sat_last": 97.0,
        "O2Sat_mean": 96.0,
        "O2Sat_std": 1.0,
        "O2Sat_slope": 1.0,
        "O2Sat_missing_rate": 1 / 3,
        "global_missing_rate": 1 / 6,
    }
    for column, expected in expected_statistics.items():
        assert third_sample[column] == pytest.approx(expected), column

    # Presumably positive because SepsisLabel turns 1 within the 2-hour
    # horizon after this sample; verify against build_labeled_patient_frame.
    assert third_sample[TARGET_COLUMN] == 1


def test_fit_transform_uses_training_medians_for_numeric_feature_imputation() -> None:
    """Check that ``fit_transform`` fills missing feature values with column medians."""
    columns = {
        "patient_id": ["p1", "p2", "p3"],
        "sample_index": [0, 1, 2],
        TARGET_COLUMN: [0, 1, 0],
        "global_missing_rate": [0.0, 0.5, 1.0],
        "HR_last": [80.0, 100.0, None],
        "O2Sat_mean": [95.0, None, None],
    }
    raw_features = pd.DataFrame(columns)
    featurizer = TabularFeaturizer(dynamic_columns=["HR", "O2Sat"], lookback_hours=3)

    transformed = featurizer.fit_transform(raw_features)

    # Only the per-signal columns are expected as features here.
    assert featurizer.feature_columns_ == ["HR_last", "O2Sat_mean"]

    # Medians over the observed values: (80 + 100) / 2 and the single 95.
    learned_medians = featurizer.medians_
    assert learned_medians["HR_last"] == pytest.approx(90.0)
    assert learned_medians["O2Sat_mean"] == pytest.approx(95.0)

    # Missing entries are replaced by the medians asserted above.
    imputed_hr = transformed["HR_last"].tolist()
    imputed_o2sat = transformed["O2Sat_mean"].tolist()
    assert imputed_hr == [80.0, 100.0, 90.0]
    assert imputed_o2sat == [95.0, 95.0, 95.0]