File size: 1,650 Bytes
f38abcf
3a1ca73
f2b53e5
f38abcf
2fecf67
23a6907
 
 
 
98aea7e
 
 
 
 
23a6907
f38abcf
23a6907
 
 
 
 
 
 
 
14f03aa
23a6907
 
 
f04fef5
23a6907
 
 
 
 
 
 
 
 
f2b53e5
23a6907
 
 
 
 
 
 
2fecf67
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

def preprocess_data(df):
    """
    Fill missing values in every column of ``df`` and return the frame.

    Float columns are filled with the column mean; every other column is
    filled with its most frequent value (the first mode). Columns that have
    no missing values, or that have no usable fill value because every entry
    is missing, are left untouched.

    The frame is modified in place and the same object is returned.
    """
    for col in df.columns:
        column = df[col]
        if not column.isna().any():
            continue

        if pd.api.types.is_float_dtype(column):
            fill_value = column.mean()
            # An all-missing float column has a NaN mean: nothing to fill with.
            if pd.isna(fill_value):
                continue
        else:
            modes = column.mode()
            # mode() drops missing values, so it is empty for an all-missing
            # column; indexing it would raise.
            if modes.empty:
                continue
            fill_value = modes.iloc[0]

        # Assign the result back instead of calling fillna(inplace=True) on
        # the selected column: the chained in-place form does not update the
        # frame when pandas Copy-on-Write is active.
        df[col] = column.fillna(fill_value)
    return df

def remove_outliers_iqr(df):
    """
    Drop rows that hold outliers according to the 1.5 * IQR rule.

    Each float64/int64 column is handled in turn: a row is kept only when its
    value lies within [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] (bounds inclusive).
    Rows whose value is missing fail that test and are dropped as well.
    Quartiles for a column are computed on the rows that survived the
    columns processed before it.
    """
    for name in df.select_dtypes(include=['float64', 'int64']).columns:
        first_quartile = df[name].quantile(0.25)
        third_quartile = df[name].quantile(0.75)
        spread = third_quartile - first_quartile
        lowest_allowed = first_quartile - 1.5 * spread
        highest_allowed = third_quartile + 1.5 * spread
        # Keep only the rows inside the fences before moving to the next column.
        within_fences = df[name].between(lowest_allowed, highest_allowed)
        df = df[within_fences]
    return df

def cap_extreme_values(df):
    """
    Clamp each float64/int64 column to its own 1st-99th percentile range.

    Values below a column's 1st percentile are raised to that percentile and
    values above its 99th percentile are lowered to it. The clipped columns
    are assigned back onto ``df`` itself, and that same frame is returned.
    """
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    for name in numeric_cols:
        values = df[name]
        ceiling = values.quantile(0.99)
        floor = values.quantile(0.01)
        df[name] = values.clip(lower=floor, upper=ceiling)
    return df

def convert_string_to_numeric(df):
    """
    Replace every object-dtype column with label-encoded values.

    Each selected column is passed through ``LabelEncoder.fit_transform`` and
    the result is assigned back onto ``df`` itself; that same frame is
    returned. The fitted encoder is not kept.
    """
    encoder = LabelEncoder()
    object_cols = df.select_dtypes(include=['object']).columns
    for name in object_cols:
        # fit_transform refits the shared encoder, so each column gets its
        # own independent set of codes.
        encoded = encoder.fit_transform(df[name])
        df[name] = encoded
    return df