Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| from sklearn.preprocessing import LabelEncoder, StandardScaler | |
| from sklearn.impute import SimpleImputer | |
| import logging | |
| # Configure logging for this module | |
| logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') | |
| def clean_data(df): | |
| """Cleans the input DataFrame by imputing missing values. | |
| - Numerical columns: Imputes missing values with the mean. | |
| - Categorical columns: Imputes missing values with the most frequent value. | |
| Args: | |
| df (pd.DataFrame): The input DataFrame to clean. | |
| Returns: | |
| pd.DataFrame: The DataFrame with missing values imputed. | |
| """ | |
| logging.info("Starting data cleaning process.") | |
| # Impute missing values for numerical columns | |
| numerical_cols = df.select_dtypes(include=['number']).columns | |
| if not numerical_cols.empty: | |
| logging.info(f"Imputing missing numerical values for columns: {list(numerical_cols)}") | |
| imputer_numerical = SimpleImputer(strategy='mean') | |
| df[numerical_cols] = imputer_numerical.fit_transform(df[numerical_cols]) | |
| # Impute missing values for categorical columns | |
| categorical_cols = df.select_dtypes(include=['object', 'category']).columns | |
| if not categorical_cols.empty: | |
| logging.info(f"Imputing missing categorical values for columns: {list(categorical_cols)}") | |
| imputer_categorical = SimpleImputer(strategy='most_frequent') | |
| df[categorical_cols] = imputer_categorical.fit_transform(df[categorical_cols]) | |
| logging.info("Data cleaning process completed.") | |
| return df | |
| def prepare_data(df, target_column=None): | |
| """Prepares the DataFrame for machine learning by cleaning, encoding, and scaling. | |
| Args: | |
| df (pd.DataFrame): The input DataFrame. | |
| target_column (str, optional): The name of the target column. If provided, | |
| data is prepared for supervised learning (X, y split). | |
| Otherwise, for unsupervised learning (all features). | |
| Returns: | |
| tuple: If target_column is provided: | |
| (X (pd.DataFrame), y (pd.Series), label_encoders (dict), is_classification (bool)) | |
| If target_column is None: | |
| (df_prepared (pd.DataFrame), label_encoders (dict)) | |
| """ | |
| logging.info(f"Starting data preparation process. Target column: {target_column}") | |
| df = clean_data(df.copy()) # Ensure we work on a copy to avoid modifying original df | |
| label_encoders = {} | |
| is_classification = False | |
| # Encode categorical features (excluding the target column if it's categorical) | |
| for col in df.select_dtypes(include=['object', 'category']).columns: | |
| if col != target_column: | |
| logging.info(f"Encoding categorical feature: {col}") | |
| le = LabelEncoder() | |
| df[col] = le.fit_transform(df[col]) | |
| label_encoders[col] = le | |
| if target_column: | |
| # Supervised learning preparation | |
| logging.info(f"Preparing data for supervised learning with target: {target_column}") | |
| # Determine if it's a classification or regression task based on target column properties | |
| if df[target_column].dtype == 'object' or df[target_column].nunique() <= 10: # Heuristic for classification | |
| is_classification = True | |
| logging.info(f"Target column '{target_column}' identified as classification.") | |
| le = LabelEncoder() | |
| df[target_column] = le.fit_transform(df[target_column]) | |
| label_encoders[target_column] = le | |
| else: | |
| logging.info(f"Target column '{target_column}' identified as regression.") | |
| X = df.drop(columns=[target_column]) | |
| y = df[target_column] | |
| # Scale numerical features in X | |
| numerical_cols = X.select_dtypes(include=['number']).columns | |
| if not numerical_cols.empty: | |
| logging.info(f"Scaling numerical features in X: {list(numerical_cols)}") | |
| scaler = StandardScaler() | |
| X[numerical_cols] = scaler.fit_transform(X[numerical_cols]) | |
| logging.info("Data preparation for supervised learning completed.") | |
| return X, y, label_encoders, is_classification | |
| else: | |
| # Unsupervised learning preparation (scale all numerical features) | |
| logging.info("Preparing data for unsupervised learning.") | |
| numerical_cols = df.select_dtypes(include=['number']).columns | |
| if not numerical_cols.empty: | |
| logging.info(f"Scaling numerical features for unsupervised learning: {list(numerical_cols)}") | |
| scaler = StandardScaler() | |
| df[numerical_cols] = scaler.fit_transform(df[numerical_cols]) | |
| logging.info("Data preparation for unsupervised learning completed.") | |
| return df.copy(), label_encoders | |