saherPervaiz committed on
Commit
2fecf67
·
verified ·
1 Parent(s): f2b53e5

Update utils/data_cleaning.py

Browse files
Files changed (1) hide show
  1. utils/data_cleaning.py +29 -113
utils/data_cleaning.py CHANGED
@@ -2,129 +2,45 @@ import pandas as pd
2
  import numpy as np
3
  from sklearn.preprocessing import LabelEncoder
4
 
5
def handle_missing_values(df, method='Drop rows'):
    """
    Handle missing values in the DataFrame.

    Parameters:
    - df: The input DataFrame. It is never modified; a new frame is returned
      whenever anything has to change.
    - method: 'Drop rows' removes every row that contains a missing value.
      'Fill with mean/median' fills numeric columns with the column mean and
      every other column with its most frequent value. Any other value
      leaves the data untouched.

    Returns:
    - df: The DataFrame after handling missing values.
    """
    if method == 'Drop rows':
        return df.dropna()
    if method != 'Fill with mean/median':
        return df

    # Work on a copy and assign columns back explicitly:
    # `df[col].fillna(..., inplace=True)` operates on a temporary Series and
    # does not update `df` when pandas copy-on-write is active.
    df = df.copy()
    for col in df.columns:
        column = df[col]
        if not column.isna().any():
            continue

        is_numeric = (
            pd.api.types.is_numeric_dtype(column)
            and not pd.api.types.is_bool_dtype(column)
        )
        if is_numeric:
            fill_value = column.mean()
        else:
            modes = column.mode()
            if modes.empty:
                # Entirely missing column: there is no most-frequent value.
                continue
            fill_value = modes.iloc[0]

        df[col] = column.fillna(fill_value)
    return df
25
 
26
def remove_outliers_iqr(df):
    """
    Remove outliers using the IQR (Interquartile Range) method for all numerical columns.

    Columns are processed one after another, so the quartiles of a column are
    computed on the rows that survived the previous columns. A row is kept
    when its value lies within [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]. Missing
    values are not outliers and are kept; handling them is a separate step.

    Parameters:
    - df: The input DataFrame.

    Returns:
    - df: The DataFrame after removing outliers.
    """
    numerical_cols = df.select_dtypes(include=[np.number]).columns
    for col in numerical_cols:
        original_count = len(df)
        values = df[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        # NaN compares False against any bound, so without the isna() term a
        # missing value (or an all-NaN column) would silently delete rows.
        keep = values.between(lower_bound, upper_bound) | values.isna()
        df = df[keep]
        removed_rows = original_count - len(df)
        print(f"Removed outliers from **{col}**: {removed_rows} rows removed.")
    return df
48
 
49
def cap_extreme_values(df, lower_quantile=0.001, upper_quantile=0.999):
    """
    Cap extreme values of each numerical column to a pair of quantiles.

    With the defaults, values below the 0.1th percentile or above the 99.9th
    percentile of a column are replaced by that percentile.

    Parameters:
    - df: The input DataFrame. It is not modified.
    - lower_quantile: Quantile (0..1) used as the lower cap.
    - upper_quantile: Quantile (0..1) used as the upper cap.

    Returns:
    - df: A new DataFrame after capping extreme values.

    Raises:
    - ValueError: If the quantiles are outside [0, 1] or lower > upper.
    """
    if not 0 <= lower_quantile <= upper_quantile <= 1:
        raise ValueError(
            "quantiles must satisfy 0 <= lower_quantile <= upper_quantile <= 1"
        )

    # Copy so the caller's frame (possibly a filtered slice of another frame)
    # is not modified by the column assignments below.
    df = df.copy()
    for col in df.select_dtypes(include=[np.number]).columns:
        lower_limit = df[col].quantile(lower_quantile)
        upper_limit = df[col].quantile(upper_quantile)
        df[col] = df[col].clip(lower=lower_limit, upper=upper_limit)
    return df
65
 
66
def convert_string_to_numeric(df, method='Label Encoding'):
    """
    Convert string categorical values to numerical values.

    Only columns with dtype `object` are converted. The input DataFrame is
    never modified; both methods return a new frame.

    Parameters:
    - df: The input DataFrame.
    - method: The method to convert strings to numbers: 'Label Encoding' or
      'One-Hot Encoding'. One-hot encoding drops the first level of every
      column (`drop_first=True`). Any other value leaves the data untouched.

    Returns:
    - df: The DataFrame with string columns converted to numeric values.
    """
    object_cols = list(df.select_dtypes(include=['object']).columns)

    if method == 'Label Encoding':
        # Copy first: assigning encoded columns would otherwise modify the
        # caller's frame, unlike the one-hot branch.
        df = df.copy()
        for col in object_cols:
            # Column values are handed to the encoder as they are.
            df[col] = LabelEncoder().fit_transform(df[col])
        return df

    if method == 'One-Hot Encoding':
        if not object_cols:
            return df
        # One call handles every column; dummy columns are appended in the
        # order of `object_cols`.
        return pd.get_dummies(df, columns=object_cols, drop_first=True)

    return df
84
-
85
# Streamlit app logic: upload a CSV and show the dataset after each cleaning
# step (missing values -> encoding -> outlier removal -> capping).
import streamlit as st

# File Upload
uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])

if uploaded_file is not None:
    # Read the uploaded CSV file
    df = pd.read_csv(uploaded_file)

    # Display the original dataset
    st.write("Original Dataset:")
    st.dataframe(df)

    # Handle missing values BEFORE encoding. convert_string_to_numeric hands
    # each column to the encoder as-is, so missing entries would otherwise be
    # passed to it instead of being dropped or filled here.
    st.write("Handling Missing (Null) Values:")
    fill_method = st.selectbox("Choose how to handle missing values", ["Drop rows", "Fill with mean/median"])
    df = handle_missing_values(df, method=fill_method)

    # Display the dataset after handling missing values
    st.write("Dataset After Handling Missing Values:")
    st.dataframe(df)

    # Convert categorical columns to numerical values
    st.write("Converting Categorical Columns to Numerical Values:")
    df = convert_string_to_numeric(df, method='Label Encoding')

    # Display the dataset after conversion
    st.write("Dataset After Conversion:")
    st.dataframe(df)

    # Remove outliers using the IQR method
    st.write("Removing Outliers Using IQR:")
    df = remove_outliers_iqr(df)

    # Display the dataset after outlier removal
    st.write("Dataset After Outlier Removal:")
    st.dataframe(df)

    # Cap extreme values at the limits chosen by cap_extreme_values
    # (its defaults are the 0.1th and 99.9th percentiles).
    st.write("Handling Extreme Values (Capping):")
    df = cap_extreme_values(df)

    # Display dataset after capping extreme values
    st.write("Dataset After Capping Extreme Values:")
    st.dataframe(df)
 
2
  import numpy as np
3
  from sklearn.preprocessing import LabelEncoder
4
 
5
def preprocess_data(df, fill_method="Fill with mean/median"):
    """
    Clean a DataFrame: handle missing values, label-encode categorical
    columns, drop IQR outliers and cap the remaining extremes.

    Steps, in order:
    1. Missing values are dropped or filled according to `fill_method`.
       This runs before encoding so missing entries are never handed to the
       encoder.
    2. Every column that has dtype `object` or at most 10 distinct values is
       label-encoded.
    3. For each numeric column, rows outside
       [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] are removed.
    4. Each numeric column is clipped to its 5th and 95th percentiles.

    Parameters:
    - df: The input DataFrame. It is not modified.
    - fill_method: "Fill with mean/median" (default) fills numeric columns
      with their mean and other columns with their most frequent value;
      "Drop rows" removes rows containing missing values.

    Returns:
    - df: A new, cleaned DataFrame.

    Raises:
    - ValueError: If `fill_method` is not one of the two supported values.
    """
    if fill_method not in ("Drop rows", "Fill with mean/median"):
        raise ValueError(f"Unsupported fill_method: {fill_method!r}")

    df = _handle_missing(df.copy(), fill_method)

    # Convert categorical (str) and low-cardinality data to numerical codes.
    # The encoder is created only when a column actually needs it.
    label_encoder = None
    for col in df.columns:
        column = df[col]
        if pd.api.types.is_object_dtype(column) or column.nunique(dropna=False) <= 10:
            if label_encoder is None:
                label_encoder = LabelEncoder()
            df[col] = label_encoder.fit_transform(column)

    # Remove outliers column by column; later columns see the filtered rows.
    for col in df.select_dtypes(include=[np.number]).columns:
        df = _drop_iqr_outliers(df, col)

    return _cap_to_quantiles(df, 0.05, 0.95)


def _handle_missing(df, fill_method):
    """Drop rows with missing values or fill them (mean for numeric columns, mode otherwise)."""
    if fill_method == "Drop rows":
        return df.dropna().copy()

    for col in df.columns:
        column = df[col]
        if not column.isna().any():
            continue
        is_numeric = (
            pd.api.types.is_numeric_dtype(column)
            and not pd.api.types.is_bool_dtype(column)
        )
        if is_numeric:
            fill_value = column.mean()
        else:
            modes = column.mode()
            if modes.empty:
                # Entirely missing column: nothing to fill with.
                continue
            fill_value = modes.iloc[0]
        # Assign back instead of `df[col].fillna(..., inplace=True)`, which
        # does not update `df` when pandas copy-on-write is active.
        df[col] = column.fillna(fill_value)
    return df


def _drop_iqr_outliers(data, column):
    """Return the rows of `data` whose `column` value is inside the 1.5*IQR fences or missing."""
    values = data[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    # NaN compares False against the bounds; keep such rows explicitly so an
    # all-missing column cannot empty the frame.
    return data[values.between(lower_bound, upper_bound) | values.isna()]


def _cap_to_quantiles(dataframe, lower_q, upper_q):
    """Return a copy of `dataframe` with every numeric column clipped to the given quantiles."""
    dataframe = dataframe.copy()
    for col in dataframe.select_dtypes(include=[np.number]).columns:
        lower_limit = dataframe[col].quantile(lower_q)
        upper_limit = dataframe[col].quantile(upper_q)
        dataframe[col] = dataframe[col].clip(lower=lower_limit, upper=upper_limit)
    return dataframe