Spaces:
Sleeping
Sleeping
Update utils/data_cleaning.py
Browse files- utils/data_cleaning.py +10 -9
utils/data_cleaning.py
CHANGED
|
@@ -22,23 +22,24 @@ def handle_missing_values(df, method='Drop rows'):
|
|
| 22 |
df[col].fillna(df[col].mode()[0], inplace=True)
|
| 23 |
return df
|
| 24 |
|
| 25 |
-
def remove_outliers_iqr(df
|
| 26 |
"""
|
| 27 |
-
Remove outliers using the IQR (Interquartile Range) method for
|
| 28 |
|
| 29 |
Parameters:
|
| 30 |
- df: The input DataFrame.
|
| 31 |
-
- col: The specific column to remove outliers from.
|
| 32 |
|
| 33 |
Returns:
|
| 34 |
- df: The DataFrame after removing outliers.
|
| 35 |
"""
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
|
|
|
|
|
|
| 42 |
return df
|
| 43 |
|
| 44 |
def cap_extreme_values(df):
|
|
|
|
| 22 |
df[col].fillna(df[col].mode()[0], inplace=True)
|
| 23 |
return df
|
| 24 |
|
| 25 |
+
def remove_outliers_iqr(df):
    """
    Remove outliers using the IQR (Interquartile Range) method for all numerical columns.

    For each numeric column in turn, rows whose value lies outside
    [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] are dropped. Columns are processed
    sequentially, so the quartiles of a later column are computed on the rows
    that survived the earlier columns. Missing values are not treated as
    outliers: rows holding NaN in a numeric column are kept.

    Parameters:
    - df: The input DataFrame. It is not modified in place.

    Returns:
    - df: The DataFrame after removing outliers.
    """
    # 'number' matches every numeric dtype (int32, float32, ...), not just
    # int64/float64, so narrower numeric columns are filtered as well.
    numerical_cols = df.select_dtypes(include='number').columns
    for col in numerical_cols:
        values = df[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        # NaN compares False against any bound; keep it explicitly so that
        # missing data is not silently discarded as an "outlier".
        in_range = (values >= lower_bound) & (values <= upper_bound)
        df = df[in_range | values.isna()]
    return df
|
| 44 |
|
| 45 |
def cap_extreme_values(df):
|