anonymous0523ly
/

misscp

Model card Files Files and versions

misscp / tests /test_features.py

Anonymous

Initial anonymous MissCP release

32f5a65 24 days ago

history blame contribute delete

2.97 kB

	from __future__ import annotations

	from pathlib import Path

	import pandas as pd
	import pytest

	from sepsis_mcp.constants import TARGET_COLUMN
	from sepsis_mcp.dataset import build_labeled_patient_frame
	from sepsis_mcp.features import TabularFeaturizer
	from sepsis_mcp.io import load_patient_frame


	def _write_feature_patient(path: Path) -> Path:
	    """Write a small pipe-separated patient fixture to ``path`` and return it.

	    The file holds one header line followed by five data rows, each with nine
	    ``|``-delimited fields. The second data row has ``NaN`` for ``O2Sat`` and
	    ``SepsisLabel`` switches from 0 to 1 on the fourth data row.

	    Args:
	        path: Destination file; it is created or overwritten.

	    Returns:
	        The same ``path`` that was passed in, for call chaining.
	    """
	    # Use a bare "|" delimiter. A backslash in front of the pipe is not a
	    # recognised escape sequence, so Python would keep the backslash and write
	    # it into the file, corrupting every column boundary.
	    rows = [
	        "HR|O2Sat|Age|Gender|Unit1|Unit2|HospAdmTime|ICULOS|SepsisLabel",
	        "80|95|65|1|1|0|-5|1|0",
	        "82|NaN|65|1|1|0|-5|2|0",
	        "84|97|65|1|1|0|-5|3|0",
	        "88|93|65|1|1|0|-5|4|1",
	        "90|92|65|1|1|0|-5|5|1",
	    ]
	    # No trailing newline, matching the original "\n".join(...) output.
	    path.write_text("\n".join(rows), encoding="utf-8")
	    return path


	def test_build_patient_feature_frame_computes_window_statistics_and_missingness(
	    tmp_path: Path,
	) -> None:
	    """Check the per-window features produced for the row with sample_index 2.

	    The fixture written by ``_write_feature_patient`` is loaded, labelled with
	    a two-hour horizon and featurized with a three-hour lookback over ``HR``
	    and ``O2Sat``; the resulting row is compared against fixed expectations.
	    """
	    fixture_file = _write_feature_patient(tmp_path / "p000010.psv")
	    frame = load_patient_frame(fixture_file)
	    labeled_frame = build_labeled_patient_frame(
	        frame,
	        patient_id="p000010",
	        horizon_hours=2,
	    )
	    featurizer = TabularFeaturizer(
	        dynamic_columns=["HR", "O2Sat"],
	        lookback_hours=3,
	    )

	    feature_frame = featurizer.build_patient_feature_frame(frame, labeled_frame)
	    is_third_sample = feature_frame["sample_index"] == 2
	    row = feature_frame.loc[is_third_sample].iloc[0]

	    # Expected values, checked in the order listed. The HR std equals the
	    # population standard deviation of 80, 82, 84, i.e. sqrt(8 / 3).
	    expected_features = {
	        "HR_last": 84.0,
	        "HR_mean": 82.0,
	        "HR_std": 1.632993161855452,
	        "HR_slope": 2.0,
	        "HR_missing_rate": 0.0,
	        "O2Sat_last": 97.0,
	        "O2Sat_mean": 96.0,
	        "O2Sat_std": 1.0,
	        "O2Sat_slope": 1.0,
	        "O2Sat_missing_rate": 1 / 3,
	        "global_missing_rate": 1 / 6,
	    }
	    for feature_name, expected_value in expected_features.items():
	        assert row[feature_name] == pytest.approx(expected_value)

	    # NOTE(review): presumably 1 because SepsisLabel becomes 1 within the
	    # two-hour horizon after this row -- confirm against the labelling code.
	    assert row[TARGET_COLUMN] == 1


	def test_fit_transform_uses_training_medians_for_numeric_feature_imputation() -> None:
	    """Check that ``fit_transform`` fills missing feature values with medians.

	    A three-row frame with gaps in ``HR_last`` and ``O2Sat_mean`` is fitted;
	    the test then inspects the recorded feature columns, the stored medians
	    and the imputed output columns.
	    """
	    training_columns = {
	        "patient_id": ["p1", "p2", "p3"],
	        "sample_index": [0, 1, 2],
	        TARGET_COLUMN: [0, 1, 0],
	        "global_missing_rate": [0.0, 0.5, 1.0],
	        "HR_last": [80.0, 100.0, None],
	        "O2Sat_mean": [95.0, None, None],
	    }
	    training_frame = pd.DataFrame(training_columns)
	    featurizer = TabularFeaturizer(
	        dynamic_columns=["HR", "O2Sat"],
	        lookback_hours=3,
	    )

	    imputed = featurizer.fit_transform(training_frame)

	    assert featurizer.feature_columns_ == ["HR_last", "O2Sat_mean"]

	    # Median of the observed values only: (80, 100) -> 90 and (95,) -> 95.
	    expected_medians = {"HR_last": 90.0, "O2Sat_mean": 95.0}
	    for column, median in expected_medians.items():
	        assert featurizer.medians_[column] == pytest.approx(median)

	    # Gaps are replaced by the column median; observed values are unchanged.
	    expected_columns = {
	        "HR_last": [80.0, 100.0, 90.0],
	        "O2Sat_mean": [95.0, 95.0, 95.0],
	    }
	    for column, values in expected_columns.items():
	        assert imputed[column].tolist() == values