# misscp/tests/test_features.py
# Anonymous
# Initial anonymous MissCP release
# 32f5a65
from __future__ import annotations
from pathlib import Path
import pandas as pd
import pytest
from sepsis_mcp.constants import TARGET_COLUMN
from sepsis_mcp.dataset import build_labeled_patient_frame
from sepsis_mcp.features import TabularFeaturizer
from sepsis_mcp.io import load_patient_frame
def _write_feature_patient(path: Path) -> Path:
    """Write a small pipe-separated patient fixture to *path* and return *path*.

    The fixture is a header line followed by five hourly rows. The second
    row has ``NaN`` for ``O2Sat`` and ``SepsisLabel`` changes from 0 to 1 at
    the fourth row. Lines are joined with ``"\\n"`` and no trailing newline
    is written.
    """
    fixture_lines = [
        "HR|O2Sat|Age|Gender|Unit1|Unit2|HospAdmTime|ICULOS|SepsisLabel",
        "80|95|65|1|1|0|-5|1|0",
        "82|NaN|65|1|1|0|-5|2|0",
        "84|97|65|1|1|0|-5|3|0",
        "88|93|65|1|1|0|-5|4|1",
        "90|92|65|1|1|0|-5|5|1",
    ]
    path.write_text("\n".join(fixture_lines), encoding="utf-8")
    return path
def test_build_patient_feature_frame_computes_window_statistics_and_missingness(
    tmp_path: Path,
) -> None:
    """Check per-column window statistics and missing rates for one sample.

    Builds raw features from the fixture patient with a 3-hour lookback and
    inspects the row whose ``sample_index`` is 2, i.e. the window covering
    the first three fixture rows (HR 80/82/84, O2Sat 95/NaN/97).
    """
    patient_path = _write_feature_patient(tmp_path / "p000010.psv")
    patient_frame = load_patient_frame(patient_path)
    labeled = build_labeled_patient_frame(
        patient_frame, patient_id="p000010", horizon_hours=2
    )
    featurizer = TabularFeaturizer(dynamic_columns=["HR", "O2Sat"], lookback_hours=3)

    raw_features = featurizer.build_patient_feature_frame(patient_frame, labeled)
    third_sample = raw_features.loc[raw_features["sample_index"] == 2].iloc[0]

    # HR is fully observed in the window: 80, 82, 84.
    assert third_sample["HR_last"] == pytest.approx(84.0)
    assert third_sample["HR_mean"] == pytest.approx(82.0)
    # sqrt(8 / 3): the expected value is the population (ddof=0) deviation.
    assert third_sample["HR_std"] == pytest.approx(1.632993161855452)
    assert third_sample["HR_slope"] == pytest.approx(2.0)
    assert third_sample["HR_missing_rate"] == pytest.approx(0.0)

    # O2Sat has one NaN in the window: 95, NaN, 97.
    assert third_sample["O2Sat_last"] == pytest.approx(97.0)
    assert third_sample["O2Sat_mean"] == pytest.approx(96.0)
    assert third_sample["O2Sat_std"] == pytest.approx(1.0)
    assert third_sample["O2Sat_slope"] == pytest.approx(1.0)
    assert third_sample["O2Sat_missing_rate"] == pytest.approx(1 / 3)

    # One missing cell out of 2 columns x 3 rows.
    assert third_sample["global_missing_rate"] == pytest.approx(1 / 6)
    # NOTE(review): presumably 1 because SepsisLabel becomes 1 within the
    # 2-hour horizon after this sample -- confirm against
    # build_labeled_patient_frame.
    assert third_sample[TARGET_COLUMN] == 1
def test_fit_transform_uses_training_medians_for_numeric_feature_imputation() -> None:
    """Check that ``fit_transform`` fills missing features with fitted medians.

    The input frame has one missing ``HR_last`` value and two missing
    ``O2Sat_mean`` values. The test expects the medians of the observed
    values (90.0 and 95.0) to be stored in ``medians_`` and used to fill
    the gaps, and expects ``feature_columns_`` to list only ``HR_last`` and
    ``O2Sat_mean``.
    """
    raw_features = pd.DataFrame(
        {
            "patient_id": ["p1", "p2", "p3"],
            "sample_index": [0, 1, 2],
            TARGET_COLUMN: [0, 1, 0],
            "global_missing_rate": [0.0, 0.5, 1.0],
            "HR_last": [80.0, 100.0, None],
            "O2Sat_mean": [95.0, None, None],
        }
    )
    featurizer = TabularFeaturizer(dynamic_columns=["HR", "O2Sat"], lookback_hours=3)

    transformed = featurizer.fit_transform(raw_features)

    assert featurizer.feature_columns_ == ["HR_last", "O2Sat_mean"]
    # Median of (80, 100) and of the single observed value 95.
    assert featurizer.medians_["HR_last"] == pytest.approx(90.0)
    assert featurizer.medians_["O2Sat_mean"] == pytest.approx(95.0)
    # Observed values are kept; only the missing entries take the median.
    assert transformed["HR_last"].tolist() == [80.0, 100.0, 90.0]
    assert transformed["O2Sat_mean"].tolist() == [95.0, 95.0, 95.0]