AutoML / utils /data_cleaner.py
Al1Abdullah's picture
Initial commit of AutoML project
aa68823
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import logging
# Configure logging for this module
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def clean_data(df):
"""Cleans the input DataFrame by imputing missing values.
- Numerical columns: Imputes missing values with the mean.
- Categorical columns: Imputes missing values with the most frequent value.
Args:
df (pd.DataFrame): The input DataFrame to clean.
Returns:
pd.DataFrame: The DataFrame with missing values imputed.
"""
logging.info("Starting data cleaning process.")
# Impute missing values for numerical columns
numerical_cols = df.select_dtypes(include=['number']).columns
if not numerical_cols.empty:
logging.info(f"Imputing missing numerical values for columns: {list(numerical_cols)}")
imputer_numerical = SimpleImputer(strategy='mean')
df[numerical_cols] = imputer_numerical.fit_transform(df[numerical_cols])
# Impute missing values for categorical columns
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
if not categorical_cols.empty:
logging.info(f"Imputing missing categorical values for columns: {list(categorical_cols)}")
imputer_categorical = SimpleImputer(strategy='most_frequent')
df[categorical_cols] = imputer_categorical.fit_transform(df[categorical_cols])
logging.info("Data cleaning process completed.")
return df
def prepare_data(df, target_column=None):
"""Prepares the DataFrame for machine learning by cleaning, encoding, and scaling.
Args:
df (pd.DataFrame): The input DataFrame.
target_column (str, optional): The name of the target column. If provided,
data is prepared for supervised learning (X, y split).
Otherwise, for unsupervised learning (all features).
Returns:
tuple: If target_column is provided:
(X (pd.DataFrame), y (pd.Series), label_encoders (dict), is_classification (bool))
If target_column is None:
(df_prepared (pd.DataFrame), label_encoders (dict))
"""
logging.info(f"Starting data preparation process. Target column: {target_column}")
df = clean_data(df.copy()) # Ensure we work on a copy to avoid modifying original df
label_encoders = {}
is_classification = False
# Encode categorical features (excluding the target column if it's categorical)
for col in df.select_dtypes(include=['object', 'category']).columns:
if col != target_column:
logging.info(f"Encoding categorical feature: {col}")
le = LabelEncoder()
df[col] = le.fit_transform(df[col])
label_encoders[col] = le
if target_column:
# Supervised learning preparation
logging.info(f"Preparing data for supervised learning with target: {target_column}")
# Determine if it's a classification or regression task based on target column properties
if df[target_column].dtype == 'object' or df[target_column].nunique() <= 10: # Heuristic for classification
is_classification = True
logging.info(f"Target column '{target_column}' identified as classification.")
le = LabelEncoder()
df[target_column] = le.fit_transform(df[target_column])
label_encoders[target_column] = le
else:
logging.info(f"Target column '{target_column}' identified as regression.")
X = df.drop(columns=[target_column])
y = df[target_column]
# Scale numerical features in X
numerical_cols = X.select_dtypes(include=['number']).columns
if not numerical_cols.empty:
logging.info(f"Scaling numerical features in X: {list(numerical_cols)}")
scaler = StandardScaler()
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])
logging.info("Data preparation for supervised learning completed.")
return X, y, label_encoders, is_classification
else:
# Unsupervised learning preparation (scale all numerical features)
logging.info("Preparing data for unsupervised learning.")
numerical_cols = df.select_dtypes(include=['number']).columns
if not numerical_cols.empty:
logging.info(f"Scaling numerical features for unsupervised learning: {list(numerical_cols)}")
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
logging.info("Data preparation for unsupervised learning completed.")
return df.copy(), label_encoders