File size: 4,845 Bytes
aa68823
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import logging

# Configure logging as a side effect of importing this module: INFO level,
# "timestamp - level - message" line format. NOTE(review): basicConfig acts on
# the root logger, so this affects the whole process rather than only this
# module -- confirm that is intended for library-style use.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def clean_data(df):
    """Impute missing values in ``df`` and return it.

    - Numerical columns: missing values are replaced with the column mean.
    - Categorical columns (``object`` / ``category`` dtype): missing values
      are replaced with the most frequent value.

    Columns without a single observed value cannot be imputed (there is no
    mean or mode to compute). They are left untouched and a warning is
    logged. Handing them to ``SimpleImputer`` would make it drop them from
    its output, and assigning that narrower result back into ``df`` would
    fail with a column-count mismatch.

    Note:
        ``df`` is modified in place; pass a copy to keep the original intact.

    Args:
        df (pd.DataFrame): The input DataFrame to clean.

    Returns:
        pd.DataFrame: The same DataFrame object, with missing values imputed.
    """
    logging.info("Starting data cleaning process.")

    # (dtype selector, SimpleImputer strategy, label used in log messages)
    imputation_plan = (
        (['number'], 'mean', 'numerical'),
        (['object', 'category'], 'most_frequent', 'categorical'),
    )

    for dtypes, strategy, label in imputation_plan:
        candidate_cols = df.select_dtypes(include=dtypes).columns
        if candidate_cols.empty:
            continue

        # Split the candidates into columns that have something to learn a
        # fill value from and columns that are entirely missing.
        has_observed = df[candidate_cols].notna().any().to_numpy()
        usable_cols = candidate_cols[has_observed]
        empty_cols = candidate_cols[~has_observed]

        if not empty_cols.empty:
            logging.warning(
                "Skipping imputation for %s columns with no observed values: %s",
                label,
                list(empty_cols),
            )
        if usable_cols.empty:
            continue

        logging.info(
            "Imputing missing %s values for columns: %s", label, list(usable_cols)
        )
        imputer = SimpleImputer(strategy=strategy)
        df[usable_cols] = imputer.fit_transform(df[usable_cols])

    logging.info("Data cleaning process completed.")
    return df

def _scale_numeric_features(frame, log_context):
    """Standardize every numeric column of ``frame`` in place with StandardScaler."""
    numerical_cols = frame.select_dtypes(include=['number']).columns
    if not numerical_cols.empty:
        logging.info(
            "Scaling numerical features %s: %s", log_context, list(numerical_cols)
        )
        frame[numerical_cols] = StandardScaler().fit_transform(frame[numerical_cols])
    return frame


def prepare_data(df, target_column=None, *, max_classes=10):
    """Prepares the DataFrame for machine learning by cleaning, encoding, and scaling.

    The input is copied first, so the caller's DataFrame is never modified.
    Processing steps:

    1. Missing values are imputed via ``clean_data`` (this runs on every
       column, so missing values in the target column are imputed as well).
    2. Every ``object`` / ``category`` feature column is integer-encoded with
       its own ``LabelEncoder``.
    3. If a target is given, it is classified as a classification target when
       it is non-numeric (``object`` / ``category``) or has at most
       ``max_classes`` distinct values; classification targets are
       label-encoded too. Otherwise the task is treated as regression and the
       target is left as-is.
    4. All numeric feature columns (including the label-encoded ones) are
       standardized. The target column is never scaled.

    Args:
        df (pd.DataFrame): The input DataFrame.
        target_column (str, optional): The name of the target column. If provided,
                                       data is prepared for supervised learning (X, y split).
                                       Otherwise (``None`` or any falsy value), for
                                       unsupervised learning (all features).
        max_classes (int, optional): Keyword-only. Largest number of distinct
                                     target values for which a numeric target is
                                     still treated as classification. Defaults to 10.

    Returns:
        tuple: If target_column is provided:
                   (X (pd.DataFrame), y (pd.Series), label_encoders (dict), is_classification (bool))
               If target_column is None:
                   (df_prepared (pd.DataFrame), label_encoders (dict))
               ``label_encoders`` maps column name -> fitted ``LabelEncoder``
               (including the target column when it was encoded).

    Raises:
        KeyError: If ``target_column`` is given but is not a column of ``df``.
    """
    logging.info(f"Starting data preparation process. Target column: {target_column}")

    # Fail fast, before copying/imputing, with a message that names the problem.
    if target_column and target_column not in df.columns:
        raise KeyError(
            f"Target column {target_column!r} not found in DataFrame columns: "
            f"{list(df.columns)}"
        )

    df = clean_data(df.copy())  # Work on a copy to avoid modifying the caller's df
    label_encoders = {}

    # Encode categorical features (the target, if categorical, is handled below)
    for col in df.select_dtypes(include=['object', 'category']).columns:
        if col == target_column:
            continue
        logging.info(f"Encoding categorical feature: {col}")
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(df[col])
        label_encoders[col] = encoder

    if not target_column:
        # Unsupervised learning preparation (scale all numerical features)
        logging.info("Preparing data for unsupervised learning.")
        _scale_numeric_features(df, "for unsupervised learning")
        logging.info("Data preparation for unsupervised learning completed.")
        # df is already a private copy, so it can be returned directly.
        return df, label_encoders

    # Supervised learning preparation
    logging.info(f"Preparing data for supervised learning with target: {target_column}")
    target = df[target_column]
    # Heuristic: non-numeric targets, or targets with few distinct values,
    # are treated as class labels; everything else as a regression target.
    is_classification = bool(
        pd.api.types.is_object_dtype(target)
        or isinstance(target.dtype, pd.CategoricalDtype)
        or target.nunique() <= max_classes
    )
    if is_classification:
        logging.info(f"Target column '{target_column}' identified as classification.")
        encoder = LabelEncoder()
        df[target_column] = encoder.fit_transform(target)
        label_encoders[target_column] = encoder
    else:
        logging.info(f"Target column '{target_column}' identified as regression.")

    X = df.drop(columns=[target_column])
    y = df[target_column]

    # Scale numerical features in X only; y keeps its original/encoded values.
    _scale_numeric_features(X, "in X")

    logging.info("Data preparation for supervised learning completed.")
    return X, y, label_encoders, is_classification