Spaces:

saherPervaiz
/

Depression

Sleeping

File size: 1,650 Bytes

f38abcf
3a1ca73
f2b53e5
f38abcf
2fecf67
23a6907
 
 
 
98aea7e
 
 
 
 
23a6907
f38abcf
23a6907
 
 
 
 
 
 
 
14f03aa
23a6907
 
 
f04fef5
23a6907
 
 
 
 
 
 
 
 
f2b53e5
23a6907
 
 
 
 
 
 
2fecf67

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

def preprocess_data(df):
    """
    Fill missing values in every column of ``df`` and return it.

    Columns with a NumPy numeric dtype (integers and floats of any width) are
    filled with the column mean; every other column is filled with its most
    frequent value. Columns without missing values are not touched, and
    columns that contain only missing values are left unchanged because there
    is nothing to impute from.

    Args:
        df: DataFrame to clean. Its columns are replaced on this same object.

    Returns:
        The same DataFrame object, with missing values filled where possible.
    """
    for col in df.columns:
        series = df[col]
        if not series.isna().any():
            # Nothing to fill; skipping also keeps the column dtype untouched.
            continue

        dtype = series.dtype
        # Extension dtypes are not np.dtype instances and keep using the mode,
        # as before; bool is not a sub-dtype of np.number either.
        if isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.number):
            fill_value = series.mean()
        else:
            modes = series.mode()
            if modes.empty:
                # Every value is missing, so there is no mode to fill with.
                continue
            fill_value = modes.iloc[0]

        # Assign the result back rather than calling fillna(inplace=True) on
        # df[col]: the chained in-place form operates on an intermediate
        # object and does not update df under pandas Copy-on-Write.
        df[col] = series.fillna(fill_value)
    return df

def remove_outliers_iqr(df):
    """
    Drop rows that hold outliers, judged column by column with the IQR rule.

    The float64/int64 columns are processed in turn. For each one, rows whose
    value lies outside [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] are discarded, and the
    quartiles of the next column are computed from the rows that remain. Rows
    whose value in a checked column is missing fail both comparisons and are
    discarded as well.

    Returns a filtered DataFrame; the input object itself is not modified.
    """
    kept = df
    for name in kept.select_dtypes(include=['float64', 'int64']).columns:
        values = kept[name]
        first_quartile = values.quantile(0.25)
        third_quartile = values.quantile(0.75)
        spread = third_quartile - first_quartile

        # Fences of the accepted range for this column.
        lower_fence = first_quartile - 1.5 * spread
        upper_fence = third_quartile + 1.5 * spread

        within_fences = (values >= lower_fence) & (values <= upper_fence)
        kept = kept[within_fences]
    return kept

def cap_extreme_values(df):
    """
    Clamp each float64/int64 column to its own 1st-99th percentile range.

    Values below the column's 1st percentile are raised to it and values above
    the 99th percentile are lowered to it. The clipped columns are written back
    onto ``df``, and that same DataFrame is returned.
    """
    numeric_names = df.select_dtypes(include=['float64', 'int64']).columns
    for name in numeric_names:
        values = df[name]
        floor = values.quantile(0.01)
        ceiling = values.quantile(0.99)
        df[name] = values.clip(lower=floor, upper=ceiling)
    return df

def convert_string_to_numeric(df):
    """
    Replace every object-dtype column of ``df`` with label-encoded values.

    A single LabelEncoder is refitted on each object column in turn and the
    column is overwritten with the output of ``fit_transform``. The encoded
    columns are written back onto ``df``, and that same DataFrame is returned.
    """
    encoder = LabelEncoder()
    object_names = df.select_dtypes(include=['object']).columns
    for name in object_names:
        encoded = encoder.fit_transform(df[name])
        df[name] = encoded
    return df