saherPervaiz committed on
Commit
f38abcf
·
verified ·
1 Parent(s): 488e1b9

Update utils/data_cleaning.py

Browse files
Files changed (1) hide show
  1. utils/data_cleaning.py +40 -31
utils/data_cleaning.py CHANGED
@@ -1,31 +1,40 @@
1
- import pandas as pd
2
- import numpy as np
3
-
4
- # Function to handle missing values
5
- def handle_missing_values(df, method="drop"):
6
- if method == "drop":
7
- df = df.dropna()
8
- elif method == "fill":
9
- for col in df.columns:
10
- if df[col].dtype in ['float64', 'int64']:
11
- df[col].fillna(df[col].mean(), inplace=True)
12
- else:
13
- df[col].fillna(df[col].mode()[0], inplace=True)
14
- return df
15
-
16
- # Function to remove outliers using the IQR method
17
- def remove_outliers_iqr(df, column):
18
- Q1 = df[column].quantile(0.25)
19
- Q3 = df[column].quantile(0.75)
20
- IQR = Q3 - Q1
21
- lower_bound = Q1 - 1.5 * IQR
22
- upper_bound = Q3 + 1.5 * IQR
23
- return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
24
-
25
- # Function to cap extreme values (5th and 95th percentiles)
26
- def cap_extreme_values(df):
27
- for col in df.select_dtypes(include=[np.number]).columns:
28
- lower_limit = df[col].quantile(0.05)
29
- upper_limit = df[col].quantile(0.95)
30
- df[col] = np.clip(df[col], lower_limit, upper_limit)
31
- return df
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sklearn.preprocessing import LabelEncoder
4
+
5
def handle_missing_values(
    df: pd.DataFrame, fill_method: str = "Fill with mean/median"
) -> pd.DataFrame:
    """Handle missing values in the dataframe.

    Args:
        df: Frame to clean. In fill mode the affected columns of ``df`` are
            reassigned, so the caller's frame is updated as well.
        fill_method: ``"Drop rows"`` returns a new frame without any row that
            contains a missing value. ``"Fill with mean/median"`` (the
            default) fills NumPy-numeric columns with the column mean and
            every other column with its most frequent value. Despite the
            label, the mean is what is used. Any other value leaves the
            frame untouched.

    Returns:
        The cleaned frame.
    """
    if fill_method == "Drop rows":
        return df.dropna()
    if fill_method != "Fill with mean/median":
        return df

    for col in df.columns:
        series = df[col]
        if not series.isna().any():
            continue

        # Any NumPy numeric dtype counts (float32, int32, ...), not only
        # float64/int64. Extension dtypes keep using the mode, since a
        # fractional mean cannot be stored in e.g. a nullable Int64 column.
        dtype = series.dtype
        if isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.number):
            fill_value = series.mean()
        else:
            modes = series.mode()
            if modes.empty:
                # Column is entirely missing: there is nothing to fill with.
                continue
            fill_value = modes.iloc[0]

        # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
        # the chained in-place form does not update `df` under copy-on-write.
        df[col] = series.fillna(fill_value)
    return df
17
+
18
def remove_outliers_iqr(df: pd.DataFrame, column=None) -> pd.DataFrame:
    """Remove outlier rows using the IQR method.

    A value is an outlier when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR`` of its column. A row is dropped when any inspected
    column holds an outlier. Missing values never count as outliers.

    Args:
        df: Input frame; it is not modified.
        column: Optional single column label to inspect. When omitted, every
            numeric column is inspected and non-numeric columns are ignored
            (they are still present in the result).

    Returns:
        A new frame containing only the rows without outliers.
    """
    if column is not None:
        inspected = df[[column]]
    else:
        # Quantiles and ordering comparisons are undefined for text columns,
        # so only numeric columns take part in the outlier test.
        inspected = df.select_dtypes(include=[np.number])

    if inspected.shape[1] == 0:
        return df.copy()

    q1 = inspected.quantile(0.25)
    q3 = inspected.quantile(0.75)
    fence = 1.5 * (q3 - q1)
    is_outlier = (inspected < (q1 - fence)) | (inspected > (q3 + fence))
    return df[~is_outlier.any(axis=1)]
25
+
26
def cap_extreme_values(
    df: pd.DataFrame, lower_q: float = 0.05, upper_q: float = 0.95
) -> pd.DataFrame:
    """Cap extreme values of every numeric column at the given quantiles.

    Values below the ``lower_q`` quantile of their column are raised to it and
    values above the ``upper_q`` quantile are lowered to it. Non-numeric
    columns are left as they are.

    Args:
        df: Frame to process; its numeric columns are reassigned in place.
        lower_q: Lower quantile in ``[0, 1]``; defaults to the 5th percentile.
        upper_q: Upper quantile in ``[0, 1]``; defaults to the 95th percentile.

    Returns:
        The same frame, with capped numeric columns.

    Raises:
        ValueError: If the quantiles are not ordered or lie outside ``[0, 1]``.
    """
    if not 0 <= lower_q <= upper_q <= 1:
        raise ValueError(
            f"quantiles must satisfy 0 <= lower_q <= upper_q <= 1, "
            f"got lower_q={lower_q!r}, upper_q={upper_q!r}"
        )

    for col in df.select_dtypes(include=[np.number]).columns:
        series = df[col]
        df[col] = series.clip(
            lower=series.quantile(lower_q),
            upper=series.quantile(upper_q),
        )
    return df
33
+
34
def encode_categorical_data(df: pd.DataFrame, max_unique: int = 10) -> pd.DataFrame:
    """Encode categorical columns to numeric labels.

    A column is treated as categorical when it holds text (object or pandas
    string dtype) or when it has at most ``max_unique`` distinct values.
    Note that the second rule also re-labels low-cardinality numeric columns.

    Args:
        df: Frame to process; encoded columns are reassigned in place.
        max_unique: Largest number of distinct values (missing values counted
            as one) for which a non-text column is still encoded.

    Returns:
        The same frame, with categorical columns replaced by the output of
        ``LabelEncoder.fit_transform``.
    """
    # One encoder is reused: fit_transform refits it for every column.
    label_encoder = LabelEncoder()
    for col in df.columns:
        series = df[col]
        # Checking only `dtype == 'object'` would miss columns stored with
        # the pandas string dtype.
        holds_text = pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series)
        if holds_text or len(series.unique()) <= max_unique:
            df[col] = label_encoder.fit_transform(series)
    return df