Depression / utils /data_cleaning.py
saherPervaiz's picture
Update utils/data_cleaning.py
98aea7e verified
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
def preprocess_data(df):
    """
    Fill the missing values of every column in ``df``.

    Columns whose dtype is ``float64`` or ``int64`` are filled with the column
    mean. Every other column is filled with its most frequent value (the first
    entry returned by ``Series.mode()``). A non-numeric column that holds no
    non-missing value at all has no mode and is left untouched.

    The frame passed in is modified in place and the same object is returned.
    """
    for col in df.columns:
        column = df[col]
        if column.dtype in ['float64', 'int64']:
            fill_value = column.mean()
        else:
            modes = column.mode()
            if modes.empty:
                # An all-missing column has no mode, so there is nothing to
                # fill with; indexing the empty result would raise.
                continue
            fill_value = modes.iloc[0]
        # Assign the filled column back instead of calling
        # fillna(inplace=True) on the df[col] selection: the chained in-place
        # form works on a temporary object and is not guaranteed to update
        # ``df`` (it never does under pandas Copy-on-Write).
        df[col] = column.fillna(fill_value)
    return df
def remove_outliers_iqr(df):
    """
    Drop rows that fall outside the 1.5 * IQR fences of any numeric column.

    The ``float64`` / ``int64`` columns are listed once from the input frame
    and then processed one after another. Each column's quartiles are computed
    on the rows that survived the previous columns, so the result can depend
    on column order. Rows whose value is missing in a processed column fail
    both bound checks and are dropped as well.

    Returns the filtered frame; the caller's frame is not modified.
    """
    for name in df.select_dtypes(include=['float64', 'int64']).columns:
        values = df[name]
        first_quartile = values.quantile(0.25)
        third_quartile = values.quantile(0.75)
        spread = third_quartile - first_quartile
        lower_fence = first_quartile - 1.5 * spread
        upper_fence = third_quartile + 1.5 * spread
        # between() is inclusive on both ends, i.e. lower <= value <= upper.
        df = df[values.between(lower_fence, upper_fence)]
    return df
def cap_extreme_values(df):
    """
    Clip every numeric column to its own 1st-99th percentile range.

    For each ``float64`` / ``int64`` column, values below the 1st percentile
    are raised to it and values above the 99th percentile are lowered to it.
    The clipped column is written back into ``df``, and the same frame is
    returned.
    """
    numeric_names = df.select_dtypes(include=['float64', 'int64']).columns
    for name in numeric_names:
        values = df[name]
        bounds = {
            "lower": values.quantile(0.01),
            "upper": values.quantile(0.99),
        }
        df[name] = values.clip(**bounds)
    return df
def convert_string_to_numeric(df):
    """
    Replace each ``object``-dtype column with integer label codes.

    Every selected column is encoded independently with a scikit-learn
    ``LabelEncoder``. The encoders are local to this function and are not
    returned, so the label-to-code mapping is not available to the caller
    afterwards. The encoded columns are written back into ``df``, and the
    same frame is returned.
    """
    text_columns = df.select_dtypes(include=['object']).columns
    for name in text_columns:
        # fit_transform refits from scratch, so a fresh encoder per column
        # yields the same codes as refitting one shared encoder.
        df[name] = LabelEncoder().fit_transform(df[name])
    return df