Spaces:
Sleeping
Sleeping
File size: 4,845 Bytes
aa68823 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import logging
# Configure logging for this module.
# NOTE: logging.basicConfig() acts on the root logger, so this call is an
# import-time side effect for any program that imports this module.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def _impute_columns(df, columns, strategy, kind):
    """Impute missing values in ``columns`` of ``df`` in place.

    Args:
        df (pd.DataFrame): The frame to modify in place.
        columns (pd.Index): The columns to impute.
        strategy (str): The ``SimpleImputer`` strategy to use.
        kind (str): Human-readable column kind, used only in log messages.
    """
    if columns.empty:
        return

    logging.info(f"Imputing missing {kind} values for columns: {list(columns)}")

    # SimpleImputer discards columns that have no observed values, so its
    # output would have fewer columns than the assignment target and the
    # assignment would fail. Such columns carry nothing to impute from, so
    # they are left untouched instead.
    has_observed = df[columns].notna().any(axis=0).to_numpy()
    imputable = columns[has_observed]
    skipped = columns[~has_observed]
    if not skipped.empty:
        logging.warning(
            f"Skipping {kind} columns with no observed values: {list(skipped)}"
        )
    if imputable.empty:
        return

    imputer = SimpleImputer(strategy=strategy)
    df[imputable] = imputer.fit_transform(df[imputable])


def clean_data(df):
    """Cleans the input DataFrame by imputing missing values.

    - Numerical columns: Imputes missing values with the mean.
    - Categorical columns (``object`` / ``category`` dtype): Imputes missing
      values with the most frequent value.

    Columns that contain no observed values at all (including every column
    of a frame with zero rows) cannot be imputed and are left unchanged; a
    warning is logged for them.

    The frame is modified in place and the same object is returned. Pass a
    copy if the original must be preserved.

    Args:
        df (pd.DataFrame): The input DataFrame to clean.

    Returns:
        pd.DataFrame: The same DataFrame with missing values imputed.
    """
    logging.info("Starting data cleaning process.")

    # Impute missing values for numerical columns
    numerical_cols = df.select_dtypes(include=['number']).columns
    _impute_columns(df, numerical_cols, strategy='mean', kind='numerical')

    # Impute missing values for categorical columns
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    _impute_columns(df, categorical_cols, strategy='most_frequent', kind='categorical')

    logging.info("Data cleaning process completed.")
    return df
def _scale_numeric_features(frame, context):
    """Standardize the numeric columns of ``frame`` in place.

    Args:
        frame (pd.DataFrame): The frame whose numeric columns are scaled.
        context (str): Phrase inserted into the log message to say which
            data set is being scaled.
    """
    numeric_cols = frame.select_dtypes(include=['number']).columns
    if numeric_cols.empty:
        return
    logging.info(f"Scaling numerical features {context}: {list(numeric_cols)}")
    scaler = StandardScaler()
    frame[numeric_cols] = scaler.fit_transform(frame[numeric_cols])


def prepare_data(df, target_column=None):
    """Prepares the DataFrame for machine learning by cleaning, encoding, and scaling.

    The input frame is copied first, so the caller's DataFrame is not
    modified. Categorical features (``object`` / ``category`` dtype) are
    label-encoded and all numeric features (including the encoded ones) are
    standardized.

    Args:
        df (pd.DataFrame): The input DataFrame.
        target_column (str, optional): The name of the target column. If provided,
            data is prepared for supervised learning (X, y split).
            Otherwise (``None`` or any other falsy value), for unsupervised
            learning (all features).

    Returns:
        tuple: If target_column is provided:
            (X (pd.DataFrame), y (pd.Series), label_encoders (dict), is_classification (bool))
        If target_column is None:
            (df_prepared (pd.DataFrame), label_encoders (dict))

        ``label_encoders`` maps each encoded column name to its fitted
        ``LabelEncoder``; it includes the target column when the task is
        identified as classification.

    Raises:
        KeyError: If ``target_column`` is given but is not a column of ``df``.
    """
    logging.info(f"Starting data preparation process. Target column: {target_column}")

    # Fail fast with a clear message instead of a bare KeyError after the
    # copy, cleaning and encoding work has already been done.
    if target_column and target_column not in df.columns:
        raise KeyError(f"Target column '{target_column}' not found in DataFrame columns.")

    df = clean_data(df.copy())  # Ensure we work on a copy to avoid modifying original df
    label_encoders = {}

    # Encode categorical features (excluding the target column if it's categorical)
    for col in df.select_dtypes(include=['object', 'category']).columns:
        if col == target_column:
            continue
        logging.info(f"Encoding categorical feature: {col}")
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(df[col])
        label_encoders[col] = encoder

    if not target_column:
        # Unsupervised learning preparation (scale all numerical features)
        logging.info("Preparing data for unsupervised learning.")
        _scale_numeric_features(df, "for unsupervised learning")
        logging.info("Data preparation for unsupervised learning completed.")
        # df is already a private copy of the caller's frame.
        return df, label_encoders

    # Supervised learning preparation
    logging.info(f"Preparing data for supervised learning with target: {target_column}")
    target = df[target_column]

    # Heuristic for classification: a categorical target (object or category
    # dtype, matching what the feature-encoding loop treats as categorical)
    # or a target with at most 10 distinct values.
    # NOTE(review): clean_data() has already run on the whole frame, target
    # included, so missing target values were imputed before this check.
    is_categorical_target = (
        target.dtype == 'object' or isinstance(target.dtype, pd.CategoricalDtype)
    )
    is_classification = bool(is_categorical_target or target.nunique() <= 10)

    if is_classification:
        logging.info(f"Target column '{target_column}' identified as classification.")
        target_encoder = LabelEncoder()
        df[target_column] = target_encoder.fit_transform(df[target_column])
        label_encoders[target_column] = target_encoder
    else:
        logging.info(f"Target column '{target_column}' identified as regression.")

    X = df.drop(columns=[target_column])
    y = df[target_column]

    # Scale numerical features in X (the target is never scaled)
    _scale_numeric_features(X, "in X")

    logging.info("Data preparation for supervised learning completed.")
    return X, y, label_encoders, is_classification
|