trohith89 committed on
Commit
bea525b
·
verified ·
1 Parent(s): 7723572

Update pages/2_Data_CLeaning_and_Preprocessing.py

Browse files
pages/2_Data_CLeaning_and_Preprocessing.py CHANGED
@@ -4,101 +4,7 @@ import seaborn as sns
4
  import matplotlib.pyplot as plt
5
  from io import StringIO
6
 
7
- # Page Title
8
- st.markdown("<h1 style='text-align:center; color:white;'>Data Cleaning and Processing</h1>", unsafe_allow_html=True)
9
 
10
- # Access dataset from session state
11
- df = st.session_state.get("dataset")
12
-
13
- # Exclude 'ProductID' from the dataset
14
- if df is not None:
15
- df = df.drop(columns=['ProductID'], errors='ignore') # Exclude 'ProductID' if it exists
16
-
17
- st.subheader("Dataset Preview:")
18
- st.write(df.head())
19
-
20
- st.subheader("Info of the Dataset:")
21
- # Redirect the output of df.info() to a string buffer
22
- buffer = StringIO()
23
- df.info(buf=buffer)
24
-
25
- # Display the content in Streamlit
26
- st.write(buffer.getvalue())
27
-
28
- st.subheader("Dataset Description:")
29
- st.write(df.describe())
30
-
31
- st.subheader("Shape of the Dataset:")
32
- st.write(df.shape)
33
-
34
- # Visualize Numeric Data (Histograms and Boxplots in subplots)
35
- numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
36
- if len(numeric_columns) > 0:
37
- st.subheader("Histograms for Numeric Columns:")
38
- # Create a multidimensional subplot (grid) for all histograms
39
- num_plots = len(numeric_columns)
40
- rows = (num_plots + 1) // 2 # To create a 2-column grid layout for histograms
41
- fig, axs = plt.subplots(rows, 2, figsize=(12, 12))
42
- axs = axs.flatten() # Flatten the 2D array of axes to iterate over
43
-
44
- color_palettes_hist = ['Set1', 'Set2', 'Set3', 'Paired', 'Pastel1'] # Different color palettes for histograms
45
- for i, col in enumerate(numeric_columns):
46
- palette = sns.color_palette(color_palettes_hist[i % len(color_palettes_hist)]) # Ensure different palette for each plot
47
- sns.histplot(df[col], bins=30, kde=True, color=palette[0], ax=axs[i]) # Apply the color palette
48
- axs[i].set_title(f'Histogram of {col}')
49
- st.pyplot(fig)
50
- plt.clf()
51
-
52
- st.subheader("Boxplots for Numeric Columns:")
53
- # Create a multidimensional subplot (grid) for all boxplots
54
- fig, axs = plt.subplots(rows, 2, figsize=(12, 12))
55
- axs = axs.flatten() # Flatten the 2D array of axes to iterate over
56
-
57
- color_palettes_box = ['coolwarm', 'Blues', 'viridis', 'cubehelix', 'crest'] # Different color palettes for boxplots
58
- for i, col in enumerate(numeric_columns):
59
- palette = sns.color_palette(color_palettes_box[i % len(color_palettes_box)]) # Ensure different palette for each plot
60
- sns.boxplot(x=df[col], ax=axs[i], palette=palette)
61
- axs[i].set_title(f'Boxplot of {col}')
62
- st.pyplot(fig)
63
- plt.clf()
64
- else:
65
- st.warning("No numeric columns available for visualization.")
66
-
67
- # Visualize Categorical Data
68
- categorical_columns = df.select_dtypes(include=['object', 'category']).columns
69
- if len(categorical_columns) > 0:
70
- st.subheader("Bar Plots for Categorical Columns:")
71
- selected_cat_col = st.selectbox("Select a Categorical Column", categorical_columns)
72
-
73
- st.write(f"Value Counts for '{selected_cat_col}':")
74
- st.write(df[selected_cat_col].value_counts())
75
-
76
- plt.figure(figsize=(12, 6))
77
- sns.countplot(x=selected_cat_col, data=df, palette='coolwarm') # Unique palette for categorical data
78
- plt.title(f'Bar Plot of {selected_cat_col}')
79
- st.pyplot(plt)
80
- plt.clf()
81
- else:
82
- st.warning("No categorical columns available for visualization.")
83
-
84
- st.subheader("Cleaned Dataset:")
85
- cleaned_data = df.drop_duplicates()
86
- st.write(cleaned_data)
87
-
88
- # Store cleaned data in session state for use in next page
89
- st.session_state.cleaned_data = cleaned_data # Store cleaned data in session state
90
-
91
- # Convert cleaned data to CSV and provide a download button
92
- cleaned_csv = cleaned_data.to_csv(index=False).encode('utf-8')
93
- st.download_button(
94
- label="Download Cleaned Dataset",
95
- data=cleaned_csv,
96
- file_name="cleaned_dataset.csv",
97
- mime="text/csv"
98
- )
99
-
100
- else:
101
- st.warning("No dataset found. Please upload a dataset on the Home page.")
102
 
103
 
104
  # Define the URL of the background image (use your own image URL)
@@ -146,7 +52,103 @@ st.markdown(
146
  <div class="content-container">
147
  <div class="stMarkdown">
148
  <!-- Replace this with your app's content -->
149
- <p>Insert your app's content here. All elements will be aligned to the center and limited to 70% width.</p>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  </div>
151
  </div>
152
  """,
 
4
  import matplotlib.pyplot as plt
5
  from io import StringIO
6
 
 
 
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
 
10
  # Define the URL of the background image (use your own image URL)
 
52
  <div class="content-container">
53
  <div class="stMarkdown">
54
  <!-- Replace this with your app's content -->
55
+ <p>
56
+ # Page Title
57
+ st.markdown("<h1 style='text-align:center; color:white;'>Data Cleaning and Processing</h1>", unsafe_allow_html=True)
58
+
59
+ # Access dataset from session state
60
+ df = st.session_state.get("dataset")
61
+
62
+ # Exclude 'ProductID' from the dataset
63
+ if df is not None:
64
+ df = df.drop(columns=['ProductID'], errors='ignore') # Exclude 'ProductID' if it exists
65
+
66
+ st.subheader("Dataset Preview:")
67
+ st.write(df.head())
68
+
69
+ st.subheader("Info of the Dataset:")
70
+ # Redirect the output of df.info() to a string buffer
71
+ buffer = StringIO()
72
+ df.info(buf=buffer)
73
+
74
+ # Display the content in Streamlit
75
+ st.write(buffer.getvalue())
76
+
77
+ st.subheader("Dataset Description:")
78
+ st.write(df.describe())
79
+
80
+ st.subheader("Shape of the Dataset:")
81
+ st.write(df.shape)
82
+
83
+ # Visualize Numeric Data (Histograms and Boxplots in subplots)
84
+ numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
85
+ if len(numeric_columns) > 0:
86
+ st.subheader("Histograms for Numeric Columns:")
87
+ # Create a multidimensional subplot (grid) for all histograms
88
+ num_plots = len(numeric_columns)
89
+ rows = (num_plots + 1) // 2 # To create a 2-column grid layout for histograms
90
+ fig, axs = plt.subplots(rows, 2, figsize=(12, 12))
91
+ axs = axs.flatten() # Flatten the 2D array of axes to iterate over
92
+
93
+ color_palettes_hist = ['Set1', 'Set2', 'Set3', 'Paired', 'Pastel1'] # Different color palettes for histograms
94
+ for i, col in enumerate(numeric_columns):
95
+ palette = sns.color_palette(color_palettes_hist[i % len(color_palettes_hist)]) # Ensure different palette for each plot
96
+ sns.histplot(df[col], bins=30, kde=True, color=palette[0], ax=axs[i]) # Apply the color palette
97
+ axs[i].set_title(f'Histogram of {col}')
98
+ st.pyplot(fig)
99
+ plt.clf()
100
+
101
+ st.subheader("Boxplots for Numeric Columns:")
102
+ # Create a multidimensional subplot (grid) for all boxplots
103
+ fig, axs = plt.subplots(rows, 2, figsize=(12, 12))
104
+ axs = axs.flatten() # Flatten the 2D array of axes to iterate over
105
+
106
+ color_palettes_box = ['coolwarm', 'Blues', 'viridis', 'cubehelix', 'crest'] # Different color palettes for boxplots
107
+ for i, col in enumerate(numeric_columns):
108
+ palette = sns.color_palette(color_palettes_box[i % len(color_palettes_box)]) # Ensure different palette for each plot
109
+ sns.boxplot(x=df[col], ax=axs[i], palette=palette)
110
+ axs[i].set_title(f'Boxplot of {col}')
111
+ st.pyplot(fig)
112
+ plt.clf()
113
+ else:
114
+ st.warning("No numeric columns available for visualization.")
115
+
116
+ # Visualize Categorical Data
117
+ categorical_columns = df.select_dtypes(include=['object', 'category']).columns
118
+ if len(categorical_columns) > 0:
119
+ st.subheader("Bar Plots for Categorical Columns:")
120
+ selected_cat_col = st.selectbox("Select a Categorical Column", categorical_columns)
121
+
122
+ st.write(f"Value Counts for '{selected_cat_col}':")
123
+ st.write(df[selected_cat_col].value_counts())
124
+
125
+ plt.figure(figsize=(12, 6))
126
+ sns.countplot(x=selected_cat_col, data=df, palette='coolwarm') # Unique palette for categorical data
127
+ plt.title(f'Bar Plot of {selected_cat_col}')
128
+ st.pyplot(plt)
129
+ plt.clf()
130
+ else:
131
+ st.warning("No categorical columns available for visualization.")
132
+
133
+ st.subheader("Cleaned Dataset:")
134
+ cleaned_data = df.drop_duplicates()
135
+ st.write(cleaned_data)
136
+
137
+ # Store cleaned data in session state for use in next page
138
+ st.session_state.cleaned_data = cleaned_data # Store cleaned data in session state
139
+
140
+ # Convert cleaned data to CSV and provide a download button
141
+ cleaned_csv = cleaned_data.to_csv(index=False).encode('utf-8')
142
+ st.download_button(
143
+ label="Download Cleaned Dataset",
144
+ data=cleaned_csv,
145
+ file_name="cleaned_dataset.csv",
146
+ mime="text/csv"
147
+ )
148
+
149
+ else:
150
+ st.warning("No dataset found. Please upload a dataset on the Home page.")
151
+ </p>
152
  </div>
153
  </div>
154
  """,