CardioTrackTest

Sleeping

File size: 7,469 Bytes

f7d11f7

from io import StringIO
from types import SimpleNamespace

import joblib
import pandas as pd
from predicting_outcomes_in_heart_failure.config import MODELS_DIR, TARGET_COL
from predicting_outcomes_in_heart_failure.modeling import evaluate
import pytest

TEST_PROCESSED_CSV = (
    "Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,"
    "ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,"
    "RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up\n"
    "-1.4322063372940435,1,0.41462668821399407,0.8574469341726604,0,1.3833394263306962,"
    "0,-0.8315022488659315,0,False,True,False,False,False,True,False,False,False,True\n"
    "-0.47805724933087407,0,1.5263596504719819,-1.183717051045972,0,0.7547357326016333,"
    "0,0.10625148648034725,1,False,False,True,False,False,True,False,False,True,False\n"
    "-1.7502560332817665,1,-0.14123979291499986,0.7450892836101669,0,-1.5239526571662194,"
    "0,-0.8315022488659315,0,False,True,False,False,False,False,True,False,False,True\n"
    "-0.5840738146601151,0,0.30345339198819526,-0.5470236978585087,0,-1.1310753485855551,"
    "1,0.5751283541534866,1,True,False,False,False,False,True,False,False,True,False\n"
    "0.05202557731533111,1,0.970493169342988,-0.9028229246397381,0,-0.5810471165726252,"
    "0,-0.8315022488659315,0,False,False,True,False,False,True,False,False,False,True\n"
    "-1.5382229026232843,1,-0.6971062740439938,1.7937606888601065,0,1.3047639646145632,"
    "0,-0.8315022488659315,0,False,False,True,False,False,True,False,False,False,True\n"
    "-0.9021235106478382,0,-0.14123979291499986,-0.11631937070228347,0,1.3047639646145632,"
    "0,-0.8315022488659315,0,False,True,False,False,False,True,False,False,False,True\n"
    "0.05202557731533111,1,-1.2529727551729877,-0.6593813484210022,0,0.2047075005887033,"
    "0,-0.8315022488659315,0,False,True,False,False,False,True,False,False,False,True\n"
    "-1.7502560332817665,1,0.41462668821399407,-0.6781076235147511,0,-0.2667452697080938,"
    "1,0.5751283541534866,1,True,False,False,False,False,True,False,False,True,False\n"
)


@pytest.fixture
def processed_df():
    df = pd.read_csv(StringIO(TEST_PROCESSED_CSV))
    return df


@pytest.fixture
def definition_X_test_and_y_test(processed_df):
    df = processed_df
    X_test = df.drop(columns=[TARGET_COL])
    y_test = df[TARGET_COL]

    return X_test, y_test


@pytest.fixture
def logreg_model():
    path = MODELS_DIR / "all" / "logreg.joblib"
    model = joblib.load(path)
    return model


@pytest.fixture
def decision_tree_model():
    path = MODELS_DIR / "all" / "decision_tree.joblib"
    return joblib.load(path)


@pytest.fixture
def random_forest_model():
    path = MODELS_DIR / "all" / "random_forest.joblib"
    return joblib.load(path)


@pytest.fixture
def sample_raw_df_single():
    """
    Single-row raw sample, similar to the one used in main().
    Used to test overall preprocessing shape / columns / NaNs.
    """
    return pd.DataFrame(
        {
            "Age": [54],
            "Sex": ["F"],
            "ChestPainType": ["ASY"],
            "RestingBP": [140],
            "Cholesterol": [239],
            "FastingBS": [0],
            "RestingECG": ["Normal"],
            "MaxHR": [160],
            "ExerciseAngina": ["N"],
            "Oldpeak": [0.0],
            "ST_Slope": ["Up"],
        }
    )


@pytest.fixture
def sample_raw_df_two_rows():
    """
    Two-row raw sample with variation in categorical features,
    used to test binary encodings and one-hot encoding.
    """
    return pd.DataFrame(
        {
            "Age": [50, 60],
            "Sex": ["M", "F"],
            "ChestPainType": ["ASY", "NAP"],
            "RestingBP": [130, 140],
            "Cholesterol": [220, 250],
            "FastingBS": [0, 1],
            "RestingECG": ["Normal", "ST"],
            "MaxHR": [150, 140],
            "ExerciseAngina": ["Y", "N"],
            "Oldpeak": [1.0, 0.0],
            "ST_Slope": ["Up", "Flat"],
        }
    )


@pytest.fixture
def sample_raw_df_only_asy_up():
    """
    Three-row raw sample where:
      - ChestPainType is always ASY
      - RestingECG is always Normal
      - ST_Slope is always Up

    Used to test that missing dummy columns (NAP, TA, ST, LVH, Flat, Down)
    are still present and filled with zeros.
    """
    return pd.DataFrame(
        {
            "Age": [50, 60, 55],
            "Sex": ["M", "F", "M"],
            "ChestPainType": ["ASY", "ASY", "ASY"],
            "RestingBP": [130, 140, 135],
            "Cholesterol": [220, 250, 230],
            "FastingBS": [0, 1, 0],
            "RestingECG": ["Normal", "Normal", "Normal"],
            "MaxHR": [150, 140, 145],
            "ExerciseAngina": ["Y", "N", "Y"],
            "Oldpeak": [1.0, 0.0, 0.5],
            "ST_Slope": ["Up", "Up", "Up"],
        }
    )


@pytest.fixture
def dummy_logger(monkeypatch):
    class DummyLogger:
        def __init__(self):
            self.warnings = []
            self.infos = []
            self.errors = []
            self.successes = []

        def warning(self, msg):
            self.warnings.append(msg)

        def info(self, msg):
            self.infos.append(msg)

        def error(self, msg):
            self.errors.append(msg)

        def success(self, msg):
            self.successes.append(msg)

    logger = DummyLogger()
    monkeypatch.setattr(evaluate, "logger", logger)
    return logger


@pytest.fixture
def mlflow_no_runs(monkeypatch):
    class DummyMlflow:
        called_search_runs = 0

        class data:
            @staticmethod
            def from_pandas(*args, **kwargs):
                pytest.fail("mlflow.data.from_pandas should not be called when there are no runs")

        class sklearn:
            @staticmethod
            def log_model(*args, **kwargs):
                pytest.fail("mlflow.sklearn.log_model should not be called when there are no runs")

        @staticmethod
        def get_experiment_by_name(name):
            # we SImulate an empty experiment
            return SimpleNamespace(experiment_id="exp-123")

        @staticmethod
        def search_runs(experiment_ids, filter_string, order_by, max_results):
            DummyMlflow.called_search_runs += 1
            # Empty DataFrame → runs.empty == True
            return pd.DataFrame()

        @staticmethod
        def start_run(run_id):
            pytest.fail("mlflow.start_run should not be called when there are no runs")

        @staticmethod
        def log_input(*args, **kwargs):
            pytest.fail("mlflow.log_input should not be called when there are no runs")

        @staticmethod
        def log_metrics(*args, **kwargs):
            pytest.fail("mlflow.log_metrics should not be called when there are no runs")

    monkeypatch.setattr(evaluate, "mlflow", DummyMlflow)
    return DummyMlflow


@pytest.fixture
def mlflow_experiment_missing(monkeypatch):
    class DummyMlflow:
        called_get_experiment = 0
        called_search_runs = 0

        @staticmethod
        def get_experiment_by_name(name):
            DummyMlflow.called_get_experiment += 1
            return None  # not founded experiment

        @staticmethod
        def search_runs(*args, **kwargs):
            DummyMlflow.called_search_runs += 1
            return pd.DataFrame()

    monkeypatch.setattr(evaluate, "mlflow", DummyMlflow)
    return DummyMlflow