Spaces:
Sleeping
Sleeping
Update utils/data_cleaning.py
Browse files
- utils/data_cleaning.py +3 -0
utils/data_cleaning.py
CHANGED
|
@@ -34,12 +34,15 @@ def remove_outliers_iqr(df):
|
|
| 34 |
"""
|
| 35 |
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
|
| 36 |
for col in numerical_cols:
|
|
|
|
| 37 |
Q1 = df[col].quantile(0.25)
|
| 38 |
Q3 = df[col].quantile(0.75)
|
| 39 |
IQR = Q3 - Q1
|
| 40 |
lower_bound = Q1 - 1.5 * IQR
|
| 41 |
upper_bound = Q3 + 1.5 * IQR
|
| 42 |
df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
|
|
|
|
|
|
|
| 43 |
return df
|
| 44 |
|
| 45 |
def cap_extreme_values(df):
|
|
|
|
| 34 |
"""
|
| 35 |
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
|
| 36 |
for col in numerical_cols:
|
| 37 |
+
original_count = len(df)
|
| 38 |
Q1 = df[col].quantile(0.25)
|
| 39 |
Q3 = df[col].quantile(0.75)
|
| 40 |
IQR = Q3 - Q1
|
| 41 |
lower_bound = Q1 - 1.5 * IQR
|
| 42 |
upper_bound = Q3 + 1.5 * IQR
|
| 43 |
df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
|
| 44 |
+
removed_rows = original_count - len(df)
|
| 45 |
+
print(f"Removed outliers from **{col}**: {removed_rows} rows removed.")
|
| 46 |
return df
|
| 47 |
|
| 48 |
def cap_extreme_values(df):
|