Ashar086 committed on
Commit
1573ecb
·
verified ·
1 Parent(s): 79bdb89

Create data_cleaning.py

Browse files
Files changed (1) hide show
  1. data_cleaning.py +42 -0
data_cleaning.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sklearn.impute import SimpleImputer
4
+ from sklearn.preprocessing import StandardScaler
5
+
6
class DataCleaner:
    """Clean the numeric columns of a pandas DataFrame.

    :meth:`clean` runs three steps in order: mean-impute missing values,
    drop rows outside the 1.5 * IQR fences, then standardize with
    ``StandardScaler``. Non-numeric columns are carried through unchanged.
    Every method returns a new DataFrame and leaves its argument unmodified.
    """

    def __init__(self):
        # Both estimators are re-fitted on each call (``fit_transform``), so
        # their fitted state reflects the most recently processed frame.
        self.imputer = SimpleImputer(strategy='mean')
        self.scaler = StandardScaler()

    def clean(self, data: pd.DataFrame) -> pd.DataFrame:
        """Run the full pipeline and return the cleaned copy.

        Args:
            data: Frame to clean; it is not modified.

        Returns:
            A new frame with numeric columns imputed, outlier rows removed
            and numeric columns standardized.
        """
        data = self.handle_missing_values(data)
        data = self.remove_outliers(data)
        data = self.normalize_data(data)
        return data

    def handle_missing_values(self, data: pd.DataFrame) -> pd.DataFrame:
        """Fill missing values in numeric columns with the column mean.

        Columns that contain no observed value at all are left as they are:
        there is no mean to fill with, and ``SimpleImputer`` would discard
        such a column from its output, which breaks the column assignment.

        Args:
            data: Frame to impute; it is not modified.

        Returns:
            A copy of ``data`` with numeric gaps filled.
        """
        numeric_columns = data.select_dtypes(include=[np.number]).columns
        imputed = data.copy()
        # scikit-learn rejects input with zero features or zero samples.
        if numeric_columns.empty or imputed.empty:
            return imputed

        has_values = imputed[numeric_columns].notna().any(axis=0).to_numpy()
        imputable = numeric_columns[has_values]
        if imputable.empty:
            return imputed

        imputed[imputable] = self.imputer.fit_transform(imputed[imputable])
        return imputed

    def remove_outliers(self, data: pd.DataFrame) -> pd.DataFrame:
        """Drop rows whose numeric values fall outside the 1.5 * IQR fences.

        Columns are processed one after another, so the quartiles of a
        column are computed on the rows that survived the previous columns.
        Missing values are not treated as outliers and their rows are kept.

        Args:
            data: Frame to filter; it is not modified.

        Returns:
            A new, independent frame holding the surviving rows.
        """
        numeric_columns = data.select_dtypes(include=[np.number]).columns
        kept = data
        for column in numeric_columns:
            values = kept[column]
            q1 = values.quantile(0.25)
            q3 = values.quantile(0.75)
            iqr = q3 - q1
            # ``between`` is inclusive on both ends, matching >= and <=.
            in_range = values.between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)
            # NaN compares False against any bound; keep those rows explicitly.
            kept = kept[in_range | values.isna()]
        # Explicit copy so later column assignment does not hit a filtered
        # view of the caller's frame (avoids SettingWithCopyWarning).
        return kept.copy()

    def normalize_data(self, data: pd.DataFrame) -> pd.DataFrame:
        """Standardize numeric columns with ``StandardScaler``.

        Args:
            data: Frame to scale; it is not modified.

        Returns:
            A copy of ``data`` with numeric columns replaced by their scaled
            values. Frames with no numeric columns or no rows are returned
            as an unchanged copy because the scaler cannot be fitted on them.
        """
        numeric_columns = data.select_dtypes(include=[np.number]).columns
        scaled = data.copy()
        if numeric_columns.empty or scaled.empty:
            return scaled

        scaled[numeric_columns] = self.scaler.fit_transform(scaled[numeric_columns])
        return scaled