saherPervaiz committed on
Commit
067c46a
·
verified ·
1 Parent(s): afbaf89

Upload data_cleaning.py

Browse files
Files changed (1) hide show
  1. utils/data_cleaning.py +31 -0
utils/data_cleaning.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+
4
+ # Function to handle missing values
5
def handle_missing_values(df, method="drop"):
    """Remove or impute missing values in a DataFrame.

    Args:
        df: pandas DataFrame to clean.
        method: ``"drop"`` removes every row that contains a missing value.
            ``"fill"`` imputes column by column: NumPy integer/float columns
            get the column mean, every other column gets its most frequent
            value. Any other value leaves the frame unchanged.

    Returns:
        For ``"drop"``, the new frame produced by ``dropna()``. Otherwise the
        same ``df`` object that was passed in; ``"fill"`` writes the imputed
        columns into it.
    """
    if method == "drop":
        return df.dropna()

    if method == "fill":
        for col in df.columns:
            series = df[col]
            if not series.isna().any():
                # Nothing to impute; also avoids computing a mode on an
                # empty column.
                continue

            dtype = series.dtype
            # Accept every NumPy int/float width, not only the 64-bit ones.
            # Extension dtypes (nullable ints, categoricals, strings) stay
            # on the mode path, where the fill value is always a value the
            # column already holds.
            if isinstance(dtype, np.dtype) and dtype.kind in "iuf":
                fill_value = series.mean()
            else:
                modes = series.mode()
                if modes.empty:
                    # Column is entirely missing: there is no mode to use.
                    continue
                fill_value = modes.iloc[0]

            # Assign the filled column back instead of calling
            # fillna(inplace=True) on df[col]: that form operates on a
            # temporary and does not update df under pandas Copy-on-Write.
            df[col] = series.fillna(fill_value)

    return df
15
+
16
+ # Function to remove outliers using the IQR method
17
def remove_outliers_iqr(df, column, multiplier=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``,
    where Q1 and Q3 are the 25th and 75th percentiles of ``column``. Both
    fences are inclusive. Rows whose value fails the comparison, including
    rows where the value is missing, are not returned.

    Args:
        df: pandas DataFrame to filter.
        column: label of the numeric column used to detect outliers.
        multiplier: how many IQRs beyond the quartiles a value may lie and
            still be kept. Defaults to 1.5.

    Returns:
        A DataFrame containing only the rows inside the fences; ``df`` itself
        is not modified.

    Raises:
        ValueError: if ``multiplier`` is negative.
    """
    if multiplier < 0:
        raise ValueError("multiplier must be non-negative")

    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr
    # between() is inclusive on both ends by default.
    return df[values.between(lower_bound, upper_bound)]
24
+
25
+ # Function to cap extreme values (5th and 95th percentiles)
26
def cap_extreme_values(df, lower_quantile=0.05, upper_quantile=0.95):
    """Clip every numeric column of ``df`` to a pair of its own quantiles.

    For each column selected by ``select_dtypes(include=[np.number])``,
    values below the lower quantile are raised to it and values above the
    upper quantile are lowered to it. Other columns are left as they are.

    Args:
        df: pandas DataFrame to cap. The capped columns are written back
            into this frame.
        lower_quantile: quantile used as the floor. Defaults to 0.05.
        upper_quantile: quantile used as the ceiling. Defaults to 0.95.

    Returns:
        The same ``df`` object, with its numeric columns capped.

    Raises:
        ValueError: if the quantiles do not satisfy
            ``0 <= lower_quantile <= upper_quantile <= 1``.
    """
    if not 0 <= lower_quantile <= upper_quantile <= 1:
        raise ValueError(
            "quantiles must satisfy 0 <= lower_quantile <= upper_quantile <= 1"
        )

    for col in df.select_dtypes(include=[np.number]).columns:
        series = df[col]
        df[col] = series.clip(
            lower=series.quantile(lower_quantile),
            upper=series.quantile(upper_quantile),
        )
    return df