saherPervaiz committed on
Commit
f2b53e5
·
verified ·
1 Parent(s): df63133

Update utils/data_cleaning.py

Browse files
Files changed (1) hide show
  1. utils/data_cleaning.py +52 -3
utils/data_cleaning.py CHANGED
@@ -1,5 +1,6 @@
1
  import pandas as pd
2
  import numpy as np
 
3
 
4
  def handle_missing_values(df, method='Drop rows'):
5
  """
@@ -16,7 +17,7 @@ def handle_missing_values(df, method='Drop rows'):
16
  df = df.dropna()
17
  elif method == 'Fill with mean/median':
18
  for col in df.columns:
19
- if df[col].dtype in ['int64', 'float64']: # Numeric columns
20
  df[col].fillna(df[col].mean(), inplace=True)
21
  else: # Categorical columns
22
  df[col].fillna(df[col].mode()[0], inplace=True)
@@ -32,7 +33,7 @@ def remove_outliers_iqr(df):
32
  Returns:
33
  - df: The DataFrame after removing outliers.
34
  """
35
- numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
36
  for col in numerical_cols:
37
  original_count = len(df)
38
  Q1 = df[col].quantile(0.25)
@@ -73,9 +74,57 @@ def convert_string_to_numeric(df, method='Label Encoding'):
73
  Returns:
74
  - df: The DataFrame with string columns converted to numeric values.
75
  """
 
76
  for col in df.select_dtypes(include=['object']).columns:
77
  if method == 'Label Encoding':
78
- df[col] = df[col].astype('category').cat.codes # Label Encoding
79
  elif method == 'One-Hot Encoding':
80
  df = pd.get_dummies(df, columns=[col], drop_first=True) # One-Hot Encoding
81
  return df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import pandas as pd
2
  import numpy as np
3
+ from sklearn.preprocessing import LabelEncoder
4
 
5
  def handle_missing_values(df, method='Drop rows'):
6
  """
 
17
  df = df.dropna()
18
  elif method == 'Fill with mean/median':
19
  for col in df.columns:
20
+ if df[col].dtype in ['float64', 'int64']: # Numeric columns
21
  df[col].fillna(df[col].mean(), inplace=True)
22
  else: # Categorical columns
23
  df[col].fillna(df[col].mode()[0], inplace=True)
 
33
  Returns:
34
  - df: The DataFrame after removing outliers.
35
  """
36
+ numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
37
  for col in numerical_cols:
38
  original_count = len(df)
39
  Q1 = df[col].quantile(0.25)
 
74
  Returns:
75
  - df: The DataFrame with string columns converted to numeric values.
76
  """
77
+ label_encoder = LabelEncoder()
78
  for col in df.select_dtypes(include=['object']).columns:
79
  if method == 'Label Encoding':
80
+ df[col] = label_encoder.fit_transform(df[col]) # Label Encoding
81
  elif method == 'One-Hot Encoding':
82
  df = pd.get_dummies(df, columns=[col], drop_first=True) # One-Hot Encoding
83
  return df
84
+
85
+ # Streamlit app logic
86
+ import streamlit as st
87
+
88
+ # File Upload
89
+ uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])
90
+
91
+ if uploaded_file is not None:
92
+ # Read the uploaded CSV file
93
+ df = pd.read_csv(uploaded_file)
94
+
95
+ # Display the original dataset
96
+ st.write("Original Dataset:")
97
+ st.dataframe(df)
98
+
99
+ # Convert categorical columns to numerical values
100
+ st.write("Converting Categorical Columns to Numerical Values:")
101
+ df = convert_string_to_numeric(df, method='Label Encoding')
102
+
103
+ # Display the dataset after conversion
104
+ st.write("Dataset After Conversion:")
105
+ st.dataframe(df)
106
+
107
+ # Handle missing values
108
+ st.write("Handling Missing (Null) Values:")
109
+ fill_method = st.selectbox("Choose how to handle missing values", ["Drop rows", "Fill with mean/median"])
110
+ df = handle_missing_values(df, method=fill_method)
111
+
112
+ # Display the dataset after handling missing values
113
+ st.write("Dataset After Handling Missing Values:")
114
+ st.dataframe(df)
115
+
116
+ # Remove outliers using the IQR method
117
+ st.write("Removing Outliers Using IQR:")
118
+ df = remove_outliers_iqr(df)
119
+
120
+ # Display the dataset after outlier removal
121
+ st.write("Dataset After Outlier Removal:")
122
+ st.dataframe(df)
123
+
124
+ # Capping extreme values (5th and 95th percentiles)
125
+ st.write("Handling Extreme Values (Capping):")
126
+ df = cap_extreme_values(df)
127
+
128
+ # Display dataset after capping extreme values
129
+ st.write("Dataset After Capping Extreme Values:")
130
+ st.dataframe(df)