File size: 1,418 Bytes
1573ecb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

class DataCleaner:
    """Impute, outlier-filter and standardize the numeric columns of a DataFrame.

    Every public method works on a copy and returns a new DataFrame, so the
    frame passed in by the caller is never modified. Non-numeric columns are
    carried through untouched.
    """

    # Width of the outlier fence, in multiples of the interquartile range.
    # Override on a subclass or an instance to loosen/tighten the filter.
    IQR_MULTIPLIER = 1.5

    def __init__(self):
        self.imputer = SimpleImputer(strategy='mean')
        self.scaler = StandardScaler()

    @staticmethod
    def _fittable_numeric_columns(data: pd.DataFrame) -> list:
        """Return the numeric column labels that hold at least one value.

        Columns that are entirely missing (and every column of a zero-row
        frame) are excluded: there is nothing to estimate a mean or a scale
        from, so they are left as they are instead of being handed to the
        estimators.
        """
        numeric = data.select_dtypes(include=[np.number])
        return [name for name in numeric.columns if numeric[name].notna().any()]

    def clean(self, data: pd.DataFrame) -> pd.DataFrame:
        """Run the full pipeline: impute, drop outliers, then standardize.

        Args:
            data: Input frame; it is not modified.

        Returns:
            A new DataFrame with the three steps applied in that order.
        """
        data = self.handle_missing_values(data)
        data = self.remove_outliers(data)
        data = self.normalize_data(data)
        return data

    def handle_missing_values(self, data: pd.DataFrame) -> pd.DataFrame:
        """Fill missing numeric values using ``self.imputer``.

        Args:
            data: Input frame; it is not modified.

        Returns:
            A copy of ``data`` whose numeric columns have been replaced by the
            imputer's output. Entirely-missing numeric columns are skipped, and
            the copy is returned as-is when no column can be imputed.
        """
        cleaned = data.copy()
        columns = self._fittable_numeric_columns(cleaned)
        if not columns:
            # Nothing to fit on; return before touching the imputer.
            return cleaned
        cleaned[columns] = self.imputer.fit_transform(cleaned[columns])
        return cleaned

    def remove_outliers(self, data: pd.DataFrame) -> pd.DataFrame:
        """Drop rows lying outside the IQR fence of any numeric column.

        Columns are processed one after another, and the quartiles of each
        column are computed on the rows that survived the previous columns.
        A row is kept when its value lies within
        ``[Q1 - k * IQR, Q3 + k * IQR]`` with ``k = self.IQR_MULTIPLIER``.

        Args:
            data: Input frame; it is not modified.

        Returns:
            An independent copy containing only the surviving rows, with
            their original index labels.
        """
        kept = data
        for column in data.select_dtypes(include=[np.number]).columns:
            values = kept[column]
            q1 = values.quantile(0.25)
            q3 = values.quantile(0.75)
            if pd.isna(q1) or pd.isna(q3):
                # No observed values in this column: the bounds would be NaN
                # and every comparison false, discarding the whole frame.
                continue
            fence = self.IQR_MULTIPLIER * (q3 - q1)
            within = (values >= q1 - fence) & (values <= q3 + fence)
            kept = kept[within]
        # Hand back a real copy so later column assignments on the result do
        # not act on a slice of the caller's frame.
        return kept.copy()

    def normalize_data(self, data: pd.DataFrame) -> pd.DataFrame:
        """Standardize the numeric columns using ``self.scaler``.

        Args:
            data: Input frame; it is not modified.

        Returns:
            A copy of ``data`` whose numeric columns have been replaced by the
            scaler's output. Entirely-missing numeric columns are skipped, and
            the copy is returned as-is when no column can be scaled.
        """
        scaled = data.copy()
        columns = self._fittable_numeric_columns(scaled)
        if not columns:
            # Nothing to fit on; return before touching the scaler.
            return scaled
        scaled[columns] = self.scaler.fit_transform(scaled[columns])
        return scaled