Spaces:

saherPervaiz
/

Depression

Sleeping

App Files Files Community

saherPervaiz committed on Jan 15, 2025

Commit

2fecf67

verified ·

1 Parent(s): f2b53e5

Update utils/data_cleaning.py

Browse files

Files changed (1) hide show

utils/data_cleaning.py +29 -113

utils/data_cleaning.py CHANGED Viewed

@@ -2,129 +2,45 @@ import pandas as pd
 import numpy as np
 from sklearn.preprocessing import LabelEncoder
-def handle_missing_values(df, method='Drop rows'):
-    """
-    Handle missing values in the DataFrame.
-    Parameters:
-    - df: The input DataFrame.
-    - method: The method to handle missing values: 'Drop rows' or 'Fill with mean/median'.
-    Returns:
-    - df: The DataFrame after handling missing values.
-    """
-    if method == 'Drop rows':
         df = df.dropna()
-    elif method == 'Fill with mean/median':
         for col in df.columns:
-            if df[col].dtype in ['float64', 'int64']:  # Numeric columns
                 df[col].fillna(df[col].mean(), inplace=True)
-            else:  # Categorical columns
                 df[col].fillna(df[col].mode()[0], inplace=True)
-    return df
-def remove_outliers_iqr(df):
-    """
-    Remove outliers using the IQR (Interquartile Range) method for all numerical columns.
-    Parameters:
-    - df: The input DataFrame.
-    Returns:
-    - df: The DataFrame after removing outliers.
-    """
-    numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
-    for col in numerical_cols:
-        original_count = len(df)
-        Q1 = df[col].quantile(0.25)
-        Q3 = df[col].quantile(0.75)
         IQR = Q3 - Q1
         lower_bound = Q1 - 1.5 * IQR
         upper_bound = Q3 + 1.5 * IQR
-        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
-        removed_rows = original_count - len(df)
-        print(f"Removed outliers from **{col}**: {removed_rows} rows removed.")
-    return df
-def cap_extreme_values(df):
-    """
-    Cap extreme values (values beyond 99.9th percentile) for each numerical column.
-    Parameters:
-    - df: The input DataFrame.
-    Returns:
-    - df: The DataFrame after capping extreme values.
-    """
-    numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
-    for col in numerical_cols:
-        upper_limit = df[col].quantile(0.999)
-        lower_limit = df[col].quantile(0.001)
-        df[col] = np.clip(df[col], lower_limit, upper_limit)
-    return df
-def convert_string_to_numeric(df, method='Label Encoding'):
-    """
-    Convert string categorical values to numerical values.
-    Parameters:
-    - df: The input DataFrame.
-    - method: The method to convert strings to numbers: 'Label Encoding' or 'One-Hot Encoding'.
-    Returns:
-    - df: The DataFrame with string columns converted to numeric values.
-    """
-    label_encoder = LabelEncoder()
-    for col in df.select_dtypes(include=['object']).columns:
-        if method == 'Label Encoding':
-            df[col] = label_encoder.fit_transform(df[col])  # Label Encoding
-        elif method == 'One-Hot Encoding':
-            df = pd.get_dummies(df, columns=[col], drop_first=True)  # One-Hot Encoding
-    return df
-# Streamlit app logic
-import streamlit as st
-# File Upload
-uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])
-if uploaded_file is not None:
-    # Read the uploaded CSV file
-    df = pd.read_csv(uploaded_file)
-    # Display the original dataset
-    st.write("Original Dataset:")
-    st.dataframe(df)
-    # Convert categorical columns to numerical values
-    st.write("Converting Categorical Columns to Numerical Values:")
-    df = convert_string_to_numeric(df, method='Label Encoding')
-    # Display the dataset after conversion
-    st.write("Dataset After Conversion:")
-    st.dataframe(df)
-    # Handle missing values
-    st.write("Handling Missing (Null) Values:")
-    fill_method = st.selectbox("Choose how to handle missing values", ["Drop rows", "Fill with mean/median"])
-    df = handle_missing_values(df, method=fill_method)
-    # Display the dataset after handling missing values
-    st.write("Dataset After Handling Missing Values:")
-    st.dataframe(df)
-    # Remove outliers using the IQR method
-    st.write("Removing Outliers Using IQR:")
-    df = remove_outliers_iqr(df)
-    # Display the dataset after outlier removal
-    st.write("Dataset After Outlier Removal:")
-    st.dataframe(df)
-    # Capping extreme values (5th and 95th percentiles)
-    st.write("Handling Extreme Values (Capping):")
     df = cap_extreme_values(df)
-    # Display dataset after capping extreme values
-    st.write("Dataset After Capping Extreme Values:")
-    st.dataframe(df)

 import numpy as np
 from sklearn.preprocessing import LabelEncoder
def preprocess_data(df, fill_method="Fill with mean/median"):
    """Clean a DataFrame: impute, encode categoricals, drop outliers, cap extremes.

    The steps run in this order:

    1. Each column is classified as categorical when it holds text/object
       values or has at most 10 distinct values (a missing value counts as
       one distinct value). The decision is made on the raw input.
    2. Missing values are handled according to ``fill_method``.
    3. Categorical columns are replaced by integer codes ``0..k-1`` assigned
       in sorted order of their distinct values.
    4. For each numeric column in turn, rows outside
       ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` are removed.
    5. Each numeric column is clipped to its 5th-95th percentile range.

    Missing values are imputed *before* encoding. Encoding first would turn
    a missing value into just another category code, hiding it from the
    imputation step.

    Parameters:
    - df: The input DataFrame. It is never modified.
    - fill_method: 'Drop rows' removes every row containing a missing value;
      'Fill with mean/median' (default) fills non-categorical numeric columns
      with the column mean and all other columns with the most frequent value.

    Returns:
    - A new DataFrame with the cleaned data. The original index labels of the
      surviving rows are kept.

    Raises:
    - ValueError: if ``fill_method`` is not one of the two supported options.
    """
    valid_methods = ("Drop rows", "Fill with mean/median")
    if fill_method not in valid_methods:
        raise ValueError(
            f"fill_method must be one of {valid_methods}, got {fill_method!r}"
        )

    # Classify columns on the untouched input so that dropping or filling
    # rows cannot change which columns are considered categorical.
    categorical_cols = [col for col in df.columns if _is_categorical(df[col])]

    # Both branches produce an independent copy; the caller's frame is safe.
    if fill_method == "Drop rows":
        df = df.dropna().copy()
    else:
        df = _fill_missing(df.copy(), categorical_cols)

    for col in categorical_cols:
        # sort=True assigns codes in sorted order of the distinct values.
        # A value that is still missing (all-missing column) is coded as -1.
        codes, _ = pd.factorize(df[col], sort=True)
        df[col] = codes.astype("int64")

    # The column list is fixed up front; each pass filters the rows that
    # survived the previous passes.
    for col in df.select_dtypes(include=[np.number]).columns:
        df = _remove_outliers_iqr(df, col)

    return _cap_extreme_values(df)


def _is_categorical(column, max_levels=10):
    """Return True for text/object columns and for low-cardinality columns."""
    types = pd.api.types
    if types.is_object_dtype(column) or types.is_string_dtype(column):
        return True
    return column.nunique(dropna=False) <= max_levels


def _fill_missing(df, categorical_cols):
    """Impute missing values in place on ``df`` and return it."""
    categorical = set(categorical_cols)
    for col in df.columns:
        column = df[col]
        if not column.isna().any():
            continue
        if col not in categorical and pd.api.types.is_numeric_dtype(column):
            fill_value = column.mean()
        else:
            modes = column.mode()
            if modes.empty:
                # Column is entirely missing: there is nothing to impute from.
                continue
            fill_value = modes.iloc[0]
        # Assign back instead of fillna(inplace=True) on df[col]: the chained
        # in-place form does not update df under pandas Copy-on-Write.
        df[col] = column.fillna(fill_value)
    return df


def _remove_outliers_iqr(data, column):
    """Keep only rows whose ``column`` value lies within 1.5 * IQR of Q1/Q3."""
    q1 = data[column].quantile(0.25)
    q3 = data[column].quantile(0.75)
    iqr = q3 - q1
    return data[data[column].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)]


def _cap_extreme_values(dataframe):
    """Return a copy with numeric columns clipped to their 5th-95th percentiles."""
    # Copy first: the frame handed in is usually a filtered slice, and
    # assigning columns into a slice is unreliable.
    dataframe = dataframe.copy()
    for col in dataframe.select_dtypes(include=[np.number]).columns:
        lower_limit = dataframe[col].quantile(0.05)
        upper_limit = dataframe[col].quantile(0.95)
        dataframe[col] = dataframe[col].clip(lower_limit, upper_limit)
    return dataframe