Gowthamvemula committed on
Commit
6f508e2
·
verified ·
1 Parent(s): a1a20e6

Create 2_Data_CLeaning_and_Preprocessing.py

Browse files
pages/2_Data_CLeaning_and_Preprocessing.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from io import StringIO

# Streamlit page: previews the dataset stored in session state, plots its
# numeric and categorical columns, drops duplicate rows, and publishes the
# result as ``st.session_state.cleaned_data`` plus a CSV download.

# Source snippet displayed verbatim to the reader; it is shown, never executed.
LIBRARY_IMPORTS_SNIPPET = """
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, log_loss

import optuna
import imblearn
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE

import pickle
"""

# Palettes cycled through so that neighbouring plots get different colours.
HIST_PALETTES = ('Set1', 'Set2', 'Set3', 'Paired', 'Pastel1')
BOX_PALETTES = ('coolwarm', 'Blues', 'viridis', 'cubehelix', 'crest')


def _new_plot_grid(num_plots):
    """Create a two-column subplot grid with room for ``num_plots`` plots.

    Returns ``(fig, axes)`` where ``axes`` is a flat array. Axes beyond
    ``num_plots`` (the spare cell when the count is odd) are hidden so no
    empty frame is drawn.
    """
    rows = (num_plots + 1) // 2
    # squeeze=False always yields a 2-D array, whatever the row count.
    fig, grid = plt.subplots(rows, 2, figsize=(12, 4 * rows), squeeze=False)
    axes = grid.flatten()
    for spare in axes[num_plots:]:
        spare.set_visible(False)
    return fig, axes


def _show_and_close(fig):
    """Render ``fig`` in the app, then close it.

    The script is re-executed on every interaction; closing (rather than
    merely clearing) the figure keeps open figures from piling up.
    """
    st.pyplot(fig)
    plt.close(fig)


# Page Title
st.markdown("<h1 style='text-align:center; color:white;'>Data Cleaning and Processing</h1>", unsafe_allow_html=True)

# Access dataset from session state
df = st.session_state.get("dataset")

if df is not None:
    # Exclude 'ProductID' if it exists. drop() returns a new frame, so the
    # object held in session state is left untouched.
    df = df.drop(columns=['ProductID'], errors='ignore')

    st.subheader("Dataset Preview:")
    st.write(df.head())

    st.subheader("Info of the Dataset:")
    # df.info() writes to a stream instead of returning text: capture it.
    buffer = StringIO()
    df.info(buf=buffer)
    # st.text keeps the fixed-width table layout; Markdown rendering via
    # st.write would collapse the whitespace.
    st.text(buffer.getvalue())

    st.subheader("Dataset Description:")
    st.write(df.describe())

    st.subheader("Shape of the Dataset:")
    st.write(df.shape)

    st.markdown("### Import Necessary Libraries:")
    st.code(LIBRARY_IMPORTS_SNIPPET, language="python")

    # Visualize Numeric Data (histograms and boxplots in subplot grids).
    # "number" covers every numeric width, not only float64/int64.
    numeric_columns = df.select_dtypes(include="number").columns
    if len(numeric_columns) > 0:
        st.subheader("Histograms for Numeric Columns:")
        fig, axs = _new_plot_grid(len(numeric_columns))
        for i, col in enumerate(numeric_columns):
            palette = sns.color_palette(HIST_PALETTES[i % len(HIST_PALETTES)])
            sns.histplot(df[col], bins=30, kde=True, color=palette[0], ax=axs[i])
            axs[i].set_title(f'Histogram of {col}')
        _show_and_close(fig)

        st.subheader("Boxplots for Numeric Columns:")
        fig, axs = _new_plot_grid(len(numeric_columns))
        for i, col in enumerate(numeric_columns):
            palette = sns.color_palette(BOX_PALETTES[i % len(BOX_PALETTES)])
            # A single box only needs one colour; passing `palette` without
            # `hue` is deprecated in recent seaborn releases.
            sns.boxplot(x=df[col], ax=axs[i], color=palette[0])
            axs[i].set_title(f'Boxplot of {col}')
        _show_and_close(fig)
    else:
        st.warning("No numeric columns available for visualization.")

    # Visualize Categorical Data
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns
    if len(categorical_columns) > 0:
        st.subheader("Bar Plots for Categorical Columns:")
        selected_cat_col = st.selectbox("Select a Categorical Column", categorical_columns)

        st.write(f"Value Counts for '{selected_cat_col}':")
        st.write(df[selected_cat_col].value_counts())

        # Draw on an explicit figure instead of pyplot's implicit global one.
        fig, ax = plt.subplots(figsize=(12, 6))
        sns.countplot(x=selected_cat_col, data=df, palette='coolwarm', ax=ax)
        ax.set_title(f'Bar Plot of {selected_cat_col}')
        _show_and_close(fig)
    else:
        st.warning("No categorical columns available for visualization.")

    st.subheader("Cleaned Dataset:")
    cleaned_data = df.drop_duplicates()
    st.write(cleaned_data)

    # Store cleaned data in session state for use in next page
    st.session_state.cleaned_data = cleaned_data

    # Convert cleaned data to CSV and provide a download button
    cleaned_csv = cleaned_data.to_csv(index=False).encode('utf-8')
    st.download_button(
        label="Download Cleaned Dataset",
        data=cleaned_csv,
        file_name="cleaned_dataset.csv",
        mime="text/csv"
    )

else:
    st.warning("No dataset found. Please upload a dataset on the Home page.")


# Background styling for the page: a tiled, fixed background image, a dark
# translucent overlay, a centred content-container rule, and white Markdown
# text. The stylesheet is injected through st.markdown with raw HTML enabled.
background_image_url = "https://cdn-uploads.huggingface.co/production/uploads/67441c51a784a9d15cb12871/JUxgk4Z7jvSNM7OnB4nOw.jpeg"

# Plain (non-f) template so CSS braces need no doubling; the image URL is
# substituted for the placeholder below. Lines are kept flush-left.
_URL_PLACEHOLDER = "@@BACKGROUND_IMAGE_URL@@"
_page_css_template = """
<style>
.stApp {
background-image: url("@@BACKGROUND_IMAGE_URL@@");
background-size: auto; /* Ensures the image retains its original size */
background-repeat: repeat; /* Makes the image repeat to cover the entire background */
background-position: top left; /* Starts repeating from the top-left corner */
background-attachment: fixed; /* Keeps the background fixed as you scroll */
}

/* Semi-transparent overlay */
.stApp::before {
content: "";
position: absolute;
top: 0;
left: 0;
width: 100%;
height: 100%;
background: rgba(0, 0, 0, 0.4); /* Adjust transparency here (0.4 for 40% transparency) */
z-index: -1;
}

/* Container to center elements and limit width */
.content-container {
max-width: 70%; /* Limit content width to 70% */
margin: 0 auto; /* Center the container horizontally */
padding: 50px; /* Add padding for spacing */
}

/* Styling the markdown content */
.stMarkdown {
color: white; /* White text for better visibility */
font-size: 100px; /* Adjust font size for readability */
}
</style>
"""

_page_css = _page_css_template.replace(_URL_PLACEHOLDER, background_image_url)
st.markdown(_page_css, unsafe_allow_html=True)


# Page navigation: one button per neighbouring page, rendered in this order.
# Clicking a button switches the app to the paired page script.
_nav_targets = (
    ("Previous ⏮️", "pages/1_Data_Card_and_Data_collection.py"),
    ("Next ⏭️", "pages/3_EDA_and_Feature_Engineering.py"),
)
for _label, _page_path in _nav_targets:
    if st.button(_label):
        st.switch_page(_page_path)