trohith89 committed on
Commit
b8f183f
·
verified ·
1 Parent(s): bea525b

Update pages/2_Data_CLeaning_and_Preprocessing.py

Browse files
pages/2_Data_CLeaning_and_Preprocessing.py CHANGED
@@ -4,7 +4,101 @@ import seaborn as sns
4
  import matplotlib.pyplot as plt
5
  from io import StringIO
6
 
 
 
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
 
10
  # Define the URL of the background image (use your own image URL)
@@ -49,108 +143,12 @@ st.markdown(
49
  }}
50
  </style>
51
 
52
- <div class="content-container">
53
- <div class="stMarkdown">
54
- <!-- Replace this with your app's content -->
55
- <p>
56
- # Page Title
57
- st.markdown("<h1 style='text-align:center; color:white;'>Data Cleaning and Processing</h1>", unsafe_allow_html=True)
58
-
59
- # Access dataset from session state
60
- df = st.session_state.get("dataset")
61
-
62
- # Exclude 'ProductID' from the dataset
63
- if df is not None:
64
- df = df.drop(columns=['ProductID'], errors='ignore') # Exclude 'ProductID' if it exists
65
-
66
- st.subheader("Dataset Preview:")
67
- st.write(df.head())
68
-
69
- st.subheader("Info of the Dataset:")
70
- # Redirect the output of df.info() to a string buffer
71
- buffer = StringIO()
72
- df.info(buf=buffer)
73
-
74
- # Display the content in Streamlit
75
- st.write(buffer.getvalue())
76
-
77
- st.subheader("Dataset Description:")
78
- st.write(df.describe())
79
-
80
- st.subheader("Shape of the Dataset:")
81
- st.write(df.shape)
82
-
83
- # Visualize Numeric Data (Histograms and Boxplots in subplots)
84
- numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
85
- if len(numeric_columns) > 0:
86
- st.subheader("Histograms for Numeric Columns:")
87
- # Create a multidimensional subplot (grid) for all histograms
88
- num_plots = len(numeric_columns)
89
- rows = (num_plots + 1) // 2 # To create a 2-column grid layout for histograms
90
- fig, axs = plt.subplots(rows, 2, figsize=(12, 12))
91
- axs = axs.flatten() # Flatten the 2D array of axes to iterate over
92
-
93
- color_palettes_hist = ['Set1', 'Set2', 'Set3', 'Paired', 'Pastel1'] # Different color palettes for histograms
94
- for i, col in enumerate(numeric_columns):
95
- palette = sns.color_palette(color_palettes_hist[i % len(color_palettes_hist)]) # Ensure different palette for each plot
96
- sns.histplot(df[col], bins=30, kde=True, color=palette[0], ax=axs[i]) # Apply the color palette
97
- axs[i].set_title(f'Histogram of {col}')
98
- st.pyplot(fig)
99
- plt.clf()
100
-
101
- st.subheader("Boxplots for Numeric Columns:")
102
- # Create a multidimensional subplot (grid) for all boxplots
103
- fig, axs = plt.subplots(rows, 2, figsize=(12, 12))
104
- axs = axs.flatten() # Flatten the 2D array of axes to iterate over
105
-
106
- color_palettes_box = ['coolwarm', 'Blues', 'viridis', 'cubehelix', 'crest'] # Different color palettes for boxplots
107
- for i, col in enumerate(numeric_columns):
108
- palette = sns.color_palette(color_palettes_box[i % len(color_palettes_box)]) # Ensure different palette for each plot
109
- sns.boxplot(x=df[col], ax=axs[i], palette=palette)
110
- axs[i].set_title(f'Boxplot of {col}')
111
- st.pyplot(fig)
112
- plt.clf()
113
- else:
114
- st.warning("No numeric columns available for visualization.")
115
-
116
- # Visualize Categorical Data
117
- categorical_columns = df.select_dtypes(include=['object', 'category']).columns
118
- if len(categorical_columns) > 0:
119
- st.subheader("Bar Plots for Categorical Columns:")
120
- selected_cat_col = st.selectbox("Select a Categorical Column", categorical_columns)
121
-
122
- st.write(f"Value Counts for '{selected_cat_col}':")
123
- st.write(df[selected_cat_col].value_counts())
124
-
125
- plt.figure(figsize=(12, 6))
126
- sns.countplot(x=selected_cat_col, data=df, palette='coolwarm') # Unique palette for categorical data
127
- plt.title(f'Bar Plot of {selected_cat_col}')
128
- st.pyplot(plt)
129
- plt.clf()
130
- else:
131
- st.warning("No categorical columns available for visualization.")
132
-
133
- st.subheader("Cleaned Dataset:")
134
- cleaned_data = df.drop_duplicates()
135
- st.write(cleaned_data)
136
-
137
- # Store cleaned data in session state for use in next page
138
- st.session_state.cleaned_data = cleaned_data # Store cleaned data in session state
139
-
140
- # Convert cleaned data to CSV and provide a download button
141
- cleaned_csv = cleaned_data.to_csv(index=False).encode('utf-8')
142
- st.download_button(
143
- label="Download Cleaned Dataset",
144
- data=cleaned_csv,
145
- file_name="cleaned_dataset.csv",
146
- mime="text/csv"
147
- )
148
-
149
- else:
150
- st.warning("No dataset found. Please upload a dataset on the Home page.")
151
- </p>
152
- </div>
153
- </div>
154
  """,
155
  unsafe_allow_html=True
156
  )
 
4
  import matplotlib.pyplot as plt
5
  from io import StringIO
6
 
7
+ # Page Title
8
+ st.markdown("<h1 style='text-align:center; color:white;'>Data Cleaning and Processing</h1>", unsafe_allow_html=True)
9
 
10
+ # Access dataset from session state
11
+ df = st.session_state.get("dataset")
12
+
13
+ # Exclude 'ProductID' from the dataset
14
+ if df is not None:
15
+ df = df.drop(columns=['ProductID'], errors='ignore') # Exclude 'ProductID' if it exists
16
+
17
+ st.subheader("Dataset Preview:")
18
+ st.write(df.head())
19
+
20
+ st.subheader("Info of the Dataset:")
21
+ # Redirect the output of df.info() to a string buffer
22
+ buffer = StringIO()
23
+ df.info(buf=buffer)
24
+
25
+ # Display the content in Streamlit
26
+ st.write(buffer.getvalue())
27
+
28
+ st.subheader("Dataset Description:")
29
+ st.write(df.describe())
30
+
31
+ st.subheader("Shape of the Dataset:")
32
+ st.write(df.shape)
33
+
34
+ # Visualize Numeric Data (Histograms and Boxplots in subplots)
35
+ numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
36
+ if len(numeric_columns) > 0:
37
+ st.subheader("Histograms for Numeric Columns:")
38
+ # Create a multidimensional subplot (grid) for all histograms
39
+ num_plots = len(numeric_columns)
40
+ rows = (num_plots + 1) // 2 # To create a 2-column grid layout for histograms
41
+ fig, axs = plt.subplots(rows, 2, figsize=(12, 12))
42
+ axs = axs.flatten() # Flatten the 2D array of axes to iterate over
43
+
44
+ color_palettes_hist = ['Set1', 'Set2', 'Set3', 'Paired', 'Pastel1'] # Different color palettes for histograms
45
+ for i, col in enumerate(numeric_columns):
46
+ palette = sns.color_palette(color_palettes_hist[i % len(color_palettes_hist)]) # Ensure different palette for each plot
47
+ sns.histplot(df[col], bins=30, kde=True, color=palette[0], ax=axs[i]) # Apply the color palette
48
+ axs[i].set_title(f'Histogram of {col}')
49
+ st.pyplot(fig)
50
+ plt.clf()
51
+
52
+ st.subheader("Boxplots for Numeric Columns:")
53
+ # Create a multidimensional subplot (grid) for all boxplots
54
+ fig, axs = plt.subplots(rows, 2, figsize=(12, 12))
55
+ axs = axs.flatten() # Flatten the 2D array of axes to iterate over
56
+
57
+ color_palettes_box = ['coolwarm', 'Blues', 'viridis', 'cubehelix', 'crest'] # Different color palettes for boxplots
58
+ for i, col in enumerate(numeric_columns):
59
+ palette = sns.color_palette(color_palettes_box[i % len(color_palettes_box)]) # Ensure different palette for each plot
60
+ sns.boxplot(x=df[col], ax=axs[i], palette=palette)
61
+ axs[i].set_title(f'Boxplot of {col}')
62
+ st.pyplot(fig)
63
+ plt.clf()
64
+ else:
65
+ st.warning("No numeric columns available for visualization.")
66
+
67
+ # Visualize Categorical Data
68
+ categorical_columns = df.select_dtypes(include=['object', 'category']).columns
69
+ if len(categorical_columns) > 0:
70
+ st.subheader("Bar Plots for Categorical Columns:")
71
+ selected_cat_col = st.selectbox("Select a Categorical Column", categorical_columns)
72
+
73
+ st.write(f"Value Counts for '{selected_cat_col}':")
74
+ st.write(df[selected_cat_col].value_counts())
75
+
76
+ plt.figure(figsize=(12, 6))
77
+ sns.countplot(x=selected_cat_col, data=df, palette='coolwarm') # Unique palette for categorical data
78
+ plt.title(f'Bar Plot of {selected_cat_col}')
79
+ st.pyplot(plt)
80
+ plt.clf()
81
+ else:
82
+ st.warning("No categorical columns available for visualization.")
83
+
84
+ st.subheader("Cleaned Dataset:")
85
+ cleaned_data = df.drop_duplicates()
86
+ st.write(cleaned_data)
87
+
88
+ # Store cleaned data in session state for use in next page
89
+ st.session_state.cleaned_data = cleaned_data # Store cleaned data in session state
90
+
91
+ # Convert cleaned data to CSV and provide a download button
92
+ cleaned_csv = cleaned_data.to_csv(index=False).encode('utf-8')
93
+ st.download_button(
94
+ label="Download Cleaned Dataset",
95
+ data=cleaned_csv,
96
+ file_name="cleaned_dataset.csv",
97
+ mime="text/csv"
98
+ )
99
+
100
+ else:
101
+ st.warning("No dataset found. Please upload a dataset on the Home page.")
102
 
103
 
104
  # Define the URL of the background image (use your own image URL)
 
143
  }}
144
  </style>
145
 
146
+ # <div class="content-container">
147
+ # <div class="stMarkdown">
148
+ # <!-- Replace this with your app's content -->
149
+ # <p>Insert your app's content here. All elements will be aligned to the center and limited to 70% width.</p>
150
+ # </div>
151
+ # </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
  """,
153
  unsafe_allow_html=True
154
  )