Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| from sklearn.preprocessing import LabelEncoder | |
def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    """Fill missing values in every column of *df*.

    Floating-point columns are filled with the column mean; every other
    column is filled with its most frequent value (the first mode).
    Columns with no missing values, or with no usable fill value (for
    example a column that is entirely missing), are left unchanged.

    The frame is modified in place and also returned, so both
    ``preprocess_data(df)`` and ``df = preprocess_data(df)`` work.

    Args:
        df: Frame to clean.

    Returns:
        The same ``df`` object, with missing values filled.
    """
    for col in df.columns:
        series = df[col]
        if not series.isna().any():
            # Nothing to fill; also keeps integer columns out of the
            # mean branch, where they could never hold NaN anyway.
            continue

        if pd.api.types.is_float_dtype(series):
            # Covers float32/float16 as well as float64.
            fill_value = series.mean()
        else:
            modes = series.mode()
            if modes.empty:
                # Entirely-missing column: there is no mode to fill with.
                continue
            fill_value = modes.iloc[0]

        if pd.isna(fill_value):
            # Mean of an all-NaN column is NaN; filling would be a no-op.
            continue

        # Assign back instead of ``df[col].fillna(..., inplace=True)``:
        # the chained in-place form operates on a temporary and does not
        # update ``df`` when pandas Copy-on-Write is active.
        df[col] = series.fillna(fill_value)
    return df
def remove_outliers_iqr(df: pd.DataFrame, multiplier: float = 1.5) -> pd.DataFrame:
    """Drop rows whose numeric values fall outside the IQR fences.

    For each ``float64``/``int64`` column, rows are kept only when the
    value lies within ``[Q1 - multiplier * IQR, Q3 + multiplier * IQR]``
    (inclusive), where ``IQR = Q3 - Q1``.

    Columns are processed one after another, and each column's quartiles
    are computed on the rows that survived the previous columns. Rows
    with a missing value in a processed column are dropped as well,
    because a missing value never satisfies the range test.

    Args:
        df: Frame to filter. It is not modified.
        multiplier: Width of the fences in units of IQR. The default of
            1.5 reproduces the previous hard-coded behaviour.

    Returns:
        A filtered frame, or ``df`` itself when it has no ``float64`` or
        ``int64`` columns.
    """
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    for col in numeric_columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        fence = multiplier * (q3 - q1)
        # ``between`` is inclusive on both ends, matching ``>=`` / ``<=``.
        df = df[df[col].between(q1 - fence, q3 + fence)]
    return df
def cap_extreme_values(
    df: pd.DataFrame,
    lower_quantile: float = 0.01,
    upper_quantile: float = 0.99,
) -> pd.DataFrame:
    """Clip numeric columns to a per-column quantile range.

    For each ``float64``/``int64`` column, values below the
    ``lower_quantile`` quantile are raised to it and values above the
    ``upper_quantile`` quantile are lowered to it. Other columns are
    left untouched.

    The frame is modified in place and also returned.

    Args:
        df: Frame whose numeric columns are clipped.
        lower_quantile: Lower cut-off in ``[0, 1]``. The default of 0.01
            reproduces the previous hard-coded 1st percentile.
        upper_quantile: Upper cut-off in ``[0, 1]``, not smaller than
            ``lower_quantile``. The default of 0.99 reproduces the
            previous hard-coded 99th percentile.

    Returns:
        The same ``df`` object with its numeric columns clipped.

    Raises:
        ValueError: If the quantiles are outside ``[0, 1]`` or
            ``lower_quantile`` exceeds ``upper_quantile``.
    """
    if not 0 <= lower_quantile <= upper_quantile <= 1:
        raise ValueError(
            "quantiles must satisfy 0 <= lower_quantile <= upper_quantile <= 1"
        )

    for col in df.select_dtypes(include=['float64', 'int64']).columns:
        lower_limit = df[col].quantile(lower_quantile)
        upper_limit = df[col].quantile(upper_quantile)
        df[col] = df[col].clip(lower=lower_limit, upper=upper_limit)
    return df
def convert_string_to_numeric(df):
    """Replace each object-dtype column with integer label codes.

    A single ``LabelEncoder`` is refitted on every ``object`` column in
    turn, and the column is overwritten with the resulting codes. The
    frame is modified in place and also returned. The fitted class
    mappings are not kept, so the codes cannot be decoded afterwards
    from this function's result alone.
    """
    encoder = LabelEncoder()
    object_columns = df.select_dtypes(include=['object']).columns
    for name in object_columns:
        codes = encoder.fit_transform(df[name])
        df[name] = codes
    return df