import joblib from loguru import logger import pandas as pd from predicting_outcomes_in_heart_failure.config import ( FEMALE_CSV, INTERIM_DATA_DIR, MALE_CSV, NOSEX_CSV, NUM_COLS_DEFAULT, PREPROCESS_ARTIFACTS_DIR, PREPROCESSED_CSV, RAW_PATH, SCALER_PATH, TARGET_COL, ) from sklearn.preprocessing import StandardScaler def save_scaler_artifact(scaler: StandardScaler): """Save only the fitted scaler used during preprocessing for inference.""" PREPROCESS_ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True) joblib.dump(scaler, SCALER_PATH) logger.success(f"Saved StandardScaler to {SCALER_PATH}") def generate_gender_splits(df: pd.DataFrame): """Create and save gender-based CSV splits (female, male, nosex).""" if "Sex" in df.columns: df_female = df[df["Sex"] == 0].copy() df_female.to_csv(FEMALE_CSV, index=False) logger.success(f"Saved female-only dataset: {FEMALE_CSV} (rows={len(df_female)})") if "Sex" in df.columns: df_male = df[df["Sex"] == 1].copy() df_male.to_csv(MALE_CSV, index=False) logger.success(f"Saved male-only dataset: {MALE_CSV} (rows={len(df_male)})") df_nosex = df.drop(columns=["Sex"], errors="ignore").copy() df_nosex.to_csv(NOSEX_CSV, index=False) logger.success(f"Saved dataset without 'Sex': {NOSEX_CSV} (rows={len(df_nosex)})") def preprocessing(): """Run the full preprocessing pipeline on the raw heart dataset.""" logger.info("Starting preprocessing pipeline...") if not RAW_PATH.exists(): logger.error(f"Missing {RAW_PATH}. Put heart.csv under data/raw/ or adjust RAW_PATH.") raise FileNotFoundError(f"Missing {RAW_PATH}.") df = pd.read_csv(RAW_PATH) logger.info(f"Loaded dataset: {RAW_PATH} (rows={len(df)}, cols={df.shape[1]})") if len(df) < 2: raise ValueError("Preprocessing requires at least 2 rows, got only 1.") # Ensure target is integer df[TARGET_COL] = df[TARGET_COL].astype(int) # Remove invalid RestingBP rows if "RestingBP" in df.columns: before = len(df) df = df[df["RestingBP"] != 0].reset_index(drop=True) removed = before - len(df) if removed > 0: logger.warning(f"Removed {removed} rows with RestingBP == 0") # Impute missing/zero Cholesterol if "Cholesterol" in df.columns: zero_mask = df["Cholesterol"] == 0 if zero_mask.any(): median_chol = df.loc[~zero_mask, "Cholesterol"].median() df.loc[zero_mask, "Cholesterol"] = median_chol logger.info(f"Imputed {zero_mask.sum()} Cholesterol==0 with median={median_chol}") # Encode binary categorical features if "Sex" in df.columns: df["Sex"] = df["Sex"].map({"M": 1, "F": 0}).astype(int) logger.debug("Encoded 'Sex' as binary.") if "ExerciseAngina" in df.columns: df["ExerciseAngina"] = df["ExerciseAngina"].map({"Y": 1, "N": 0}).astype(int) logger.debug("Encoded 'ExerciseAngina' as binary.") # One-hot encode multi-category features multi_cat = [c for c in ["ChestPainType", "RestingECG", "ST_Slope"] if c in df.columns] df = pd.get_dummies(df, columns=multi_cat, drop_first=False) logger.debug(f"One-hot encoded columns: {multi_cat}") # Scale numerical columns num_cols = [c for c in NUM_COLS_DEFAULT if c in df.columns and c != TARGET_COL] scaler = StandardScaler() df[num_cols] = scaler.fit_transform(df[num_cols]) logger.info(f"Scaled numerical features: {num_cols}") # Save processed dataset df.to_csv(PREPROCESSED_CSV, index=False) logger.success( "Saved preprocessed dataset: %s (rows=%d, cols=%d)", PREPROCESSED_CSV, len(df), df.shape[1] ) # Log class distribution count_0 = (df[TARGET_COL] == 0).sum() count_1 = (df[TARGET_COL] == 1).sum() logger.info(f"Target balance — 0: {count_0} | 1: {count_1}") save_scaler_artifact(scaler) logger.success("Preprocessing completed successfully.") return df if __name__ == "__main__": INTERIM_DATA_DIR.mkdir(parents=True, exist_ok=True) df_processed = preprocessing() generate_gender_splits(df_processed)