Spaces:

saherPervaiz
/

Depression

Sleeping

App Files Community

saherPervaiz committed on Jan 15, 2025

Commit

f2b53e5

verified ·

1 Parent(s): df63133

Update utils/data_cleaning.py

Browse files

Files changed (1) hide show

utils/data_cleaning.py +52 -3

utils/data_cleaning.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import pandas as pd
 import numpy as np
 def handle_missing_values(df, method='Drop rows'):
     """
@@ -16,7 +17,7 @@ def handle_missing_values(df, method='Drop rows'):
         df = df.dropna()
     elif method == 'Fill with mean/median':
         for col in df.columns:
-            if df[col].dtype in ['int64', 'float64']:  # Numeric columns
                 df[col].fillna(df[col].mean(), inplace=True)
             else:  # Categorical columns
                 df[col].fillna(df[col].mode()[0], inplace=True)
@@ -32,7 +33,7 @@ def remove_outliers_iqr(df):
     Returns:
     - df: The DataFrame after removing outliers.
     """
-    numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
     for col in numerical_cols:
         original_count = len(df)
         Q1 = df[col].quantile(0.25)
@@ -73,9 +74,57 @@ def convert_string_to_numeric(df, method='Label Encoding'):
     Returns:
     - df: The DataFrame with string columns converted to numeric values.
     """
     for col in df.select_dtypes(include=['object']).columns:
         if method == 'Label Encoding':
-            df[col] = df[col].astype('category').cat.codes  # Label Encoding
         elif method == 'One-Hot Encoding':
             df = pd.get_dummies(df, columns=[col], drop_first=True)  # One-Hot Encoding
     return df

 import pandas as pd
 import numpy as np
+from sklearn.preprocessing import LabelEncoder
 def handle_missing_values(df, method='Drop rows'):
     """
         df = df.dropna()
     elif method == 'Fill with mean/median':
         for col in df.columns:
+            if df[col].dtype in ['float64', 'int64']:  # Numeric columns
                 df[col].fillna(df[col].mean(), inplace=True)
             else:  # Categorical columns
                 df[col].fillna(df[col].mode()[0], inplace=True)
     Returns:
     - df: The DataFrame after removing outliers.
     """
+    numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
     for col in numerical_cols:
         original_count = len(df)
         Q1 = df[col].quantile(0.25)
     Returns:
     - df: The DataFrame with string columns converted to numeric values.
     """
+    label_encoder = LabelEncoder()
     for col in df.select_dtypes(include=['object']).columns:
         if method == 'Label Encoding':
+            df[col] = label_encoder.fit_transform(df[col])  # Label Encoding
         elif method == 'One-Hot Encoding':
             df = pd.get_dummies(df, columns=[col], drop_first=True)  # One-Hot Encoding
     return df
+# Streamlit app logic
+import streamlit as st
+# File Upload
+uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])
+if uploaded_file is not None:
+    # Read the uploaded CSV file
+    df = pd.read_csv(uploaded_file)
+    # Display the original dataset
+    st.write("Original Dataset:")
+    st.dataframe(df)
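+    # Convert categorical (object-dtype) columns to numeric codes; NOTE: this runs before missing values are handled, so NaNs in those columns reach the encoder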
+    st.write("Converting Categorical Columns to Numerical Values:")
+    df = convert_string_to_numeric(df, method='Label Encoding')
+    # Display the dataset after conversion
+    st.write("Dataset After Conversion:")
+    st.dataframe(df)
+    # Handle missing values
+    st.write("Handling Missing (Null) Values:")
+    fill_method = st.selectbox("Choose how to handle missing values", ["Drop rows", "Fill with mean/median"])
+    df = handle_missing_values(df, method=fill_method)
+    # Display the dataset after handling missing values
+    st.write("Dataset After Handling Missing Values:")
+    st.dataframe(df)
+    # Remove outliers using the IQR method
+    st.write("Removing Outliers Using IQR:")
+    df = remove_outliers_iqr(df)
+    # Display the dataset after outlier removal
+    st.write("Dataset After Outlier Removal:")
+    st.dataframe(df)
+    # Capping extreme values (5th and 95th percentiles)
+    st.write("Handling Extreme Values (Capping):")
+    df = cap_extreme_values(df)
+    # Display dataset after capping extreme values
+    st.write("Dataset After Capping Extreme Values:")
+    st.dataframe(df)