DhominickJ committed on
Commit
ee40e2b
·
1 Parent(s): 134db2b

Initial Commit for the Mall Customers Prediction

Browse files
Student-Employability-Datasets(Data).csv ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sklearn.preprocessing import StandardScaler, MinMaxScaler
5
+ from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, Birch, MeanShift
6
+ from sklearn.metrics import silhouette_score, calinski_harabasz_score
7
+ import matplotlib.pyplot as plt
8
+ import seaborn as sns
9
+ import base64
10
+
11
+ # Function to load and preprocess the data
12
def load_data(file_path):
    """
    Load the dataset from a CSV file and prepare it for clustering.

    The 'Name of Student' column is removed and the categorical 'CLASS'
    column is replaced by its integer category codes.

    Parameters:
    - file_path: str, path to the CSV file

    Returns:
    - df: DataFrame, preprocessed dataset, or None when the file cannot be
      read or does not contain the expected columns (the problem is reported
      to the user through st.error)
    """
    try:
        df = pd.read_csv(file_path)
        # Drop the 'Name of Student' column as it is not numerical
        df = df.drop(columns=['Name of Student'])
        # Convert categorical 'CLASS' to numerical
        df['CLASS'] = df['CLASS'].astype('category').cat.codes
    except (OSError, KeyError, ValueError) as e:
        # OSError: missing or unreadable file.
        # KeyError: one of the expected columns is absent.
        # ValueError: malformed/empty CSV (pandas parser errors derive from
        # ValueError) or undecodable bytes.
        # Anything else is a programming error and should surface normally.
        st.error(f"Error loading data: {e}")
        return None
    return df
32
+
33
+ # Function to scale and normalize the data
34
def scale_normalize_data(df):
    """
    Standardize every column of the dataset with a freshly fitted StandardScaler.

    Parameters:
    - df: DataFrame, dataset to be scaled; a 'Cluster' column, if present,
      is left out of the result

    Returns:
    - scaled_df: DataFrame, standardized values under the original column names
    """
    # A previously attached 'Cluster' label column is not a feature
    features = df.drop(columns=['Cluster']) if 'Cluster' in df.columns else df
    standardized = StandardScaler().fit_transform(features)
    return pd.DataFrame(standardized, columns=features.columns)
50
+
51
+ # Function to create a scatter plot
52
def create_scatter_plot(df, x_col, y_col, cluster_labels):
    """
    Render a scatter plot of two columns, coloured by cluster label.

    Parameters:
    - df: DataFrame, dataset
    - x_col: str, column for x-axis
    - y_col: str, column for y-axis
    - cluster_labels: array, cluster labels
    """
    # Use an explicit Figure/Axes pair instead of pyplot's implicit global
    # figure, so exactly this figure is handed to Streamlit.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(x=x_col, y=y_col, hue=cluster_labels, data=df, palette='viridis', ax=ax)
    ax.set_title(f'Scatter Plot of {x_col} vs {y_col}')
    st.pyplot(fig)
    # Release the figure; otherwise one more figure stays alive per rerun.
    plt.close(fig)
66
+
67
+ # Function to create an elbow curve
68
def create_elbow_curve(df, max_clusters):
    """
    Plot the K-Means within-cluster sum of squares (WCSS) for 1..max_clusters
    clusters, to help choose the number of clusters.

    Parameters:
    - df: DataFrame, dataset; a 'Cluster' column, if present, is ignored so
      previously assigned labels are not treated as a feature
    - max_clusters: int, maximum number of clusters to consider
    """
    features = df.drop(columns=['Cluster'], errors='ignore')
    cluster_counts = range(1, max_clusters + 1)
    wcss = [
        KMeans(n_clusters=k, random_state=42).fit(features).inertia_
        for k in cluster_counts
    ]

    # Explicit Figure/Axes instead of pyplot's implicit global figure.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(cluster_counts, wcss, marker='o')
    ax.set_title('Elbow Curve')
    ax.set_xlabel('Number of Clusters')
    ax.set_ylabel('WCSS')
    st.pyplot(fig)
    # Release the figure; otherwise one more figure stays alive per rerun.
    plt.close(fig)
87
+
88
+ # Function to perform clustering and display results
89
def perform_clustering(df, algorithm, params):
    """
    Cluster the dataset with the chosen algorithm and display the assignments.

    The input frame is not modified: clustering runs on its feature columns
    (any existing 'Cluster' column is ignored) and the labelled table shown
    to the user is a separate copy.

    Parameters:
    - df: DataFrame, dataset
    - algorithm: str, clustering algorithm ('kmeans', 'dbscan', 'agglomerative', 'birch', 'meanshift')
    - params: dict, parameters for the algorithm ('n_clusters', 'eps',
      'min_samples' or 'bandwidth' depending on the algorithm; an optional
      'max_clusters' additionally draws an elbow curve for 'kmeans')

    Returns:
    - cluster_labels: array, one cluster label per row of df, or None when
      the algorithm name is not recognised
    """
    # Factories are lazy so only the keys of the selected algorithm are read.
    model_factories = {
        'kmeans': lambda: KMeans(n_clusters=params['n_clusters'], random_state=42),
        'dbscan': lambda: DBSCAN(eps=params['eps'], min_samples=params['min_samples']),
        'agglomerative': lambda: AgglomerativeClustering(n_clusters=params['n_clusters']),
        'birch': lambda: Birch(n_clusters=params['n_clusters']),
        'meanshift': lambda: MeanShift(bandwidth=params['bandwidth']),
    }
    make_model = model_factories.get(algorithm)
    if make_model is None:
        st.error("Invalid algorithm")
        return None

    # Labels from an earlier run must not be fed back in as a feature.
    features = df.drop(columns=['Cluster'], errors='ignore')
    cluster_labels = make_model().fit_predict(features)

    st.write("Cluster Assignments:")
    # assign() returns a new frame, so the caller's data stays label-free.
    st.dataframe(features.assign(Cluster=cluster_labels))

    # Create elbow curve if applicable
    if algorithm == 'kmeans' and 'max_clusters' in params:
        create_elbow_curve(features, params['max_clusters'])

    return cluster_labels
126
+
127
def display_performance_metrics(df, cluster_labels):
    """
    Display the silhouette and Calinski-Harabasz scores of a clustering.

    Both scores are only defined when the number of distinct labels is at
    least 2 and smaller than the number of samples; outside that range a
    notice is shown instead of the scores.

    Parameters:
    - df: DataFrame, dataset; a 'Cluster' column, if present, is ignored so
      the labels being evaluated are not also used as a feature
    - cluster_labels: array, cluster labels
    """
    features = df.drop(columns=['Cluster'], errors='ignore')
    n_labels = len(np.unique(cluster_labels))
    if not 1 < n_labels < len(features):
        st.info(
            "Performance metrics are unavailable: they require at least 2 "
            "clusters and fewer clusters than samples."
        )
        return

    silhouette = silhouette_score(features, cluster_labels)
    calinski_harabasz = calinski_harabasz_score(features, cluster_labels)
    st.write(f"Silhouette Score: {silhouette:.2f}")
    st.write(f"Calinski-Harabasz Score: {calinski_harabasz:.2f}")
140
+
141
+
142
+
143
+ # Function to allow users to input new data points
144
def input_new_data(df):
    """
    Collect one new data point from sidebar sliders and standardize it.

    One slider (range 1-5) is shown per column of df except 'Cluster'. The
    new point is standardized with the column statistics of df rather than
    with a scaler fitted on the single new row, which would turn every
    value into 0.

    Parameters:
    - df: DataFrame, reference dataset; its columns define the inputs and its
      column means/standard deviations are used to standardize the new point

    Returns:
    - scaled_new_df: DataFrame, a single row holding the standardized inputs
    """
    # NOTE(review): the sliders are fixed to 1-5, so df presumably has to be
    # the unscaled feature table for the result to be meaningful - confirm
    # against the caller.
    st.sidebar.write("Input new data for prediction:")
    feature_cols = [col for col in df.columns if col != 'Cluster']
    new_data = {col: st.sidebar.slider(f"Enter {col}", 1, 5) for col in feature_cols}
    new_df = pd.DataFrame([new_data], columns=feature_cols)

    scaler = StandardScaler().fit(df[feature_cols])
    return pd.DataFrame(scaler.transform(new_df), columns=feature_cols)
159
+
160
+ # Function to download results
161
def download_results(df):
    """
    Offer the results as a downloadable CSV file named 'cluster_results.csv'.

    Parameters:
    - df: DataFrame, results to be downloaded
    """
    csv = df.to_csv(index=False)
    # Streamlit's native download widget replaces the hand-built base64
    # data-URI link: no raw HTML is injected into the page, the payload is
    # not bound by browser data-URI limits, and the MIME type is the
    # registered 'text/csv' rather than 'file/csv'.
    st.download_button(
        label="Download CSV File",
        data=csv,
        file_name="cluster_results.csv",
        mime="text/csv",
    )
172
+
173
+ # Main function to create the Streamlit app
174
def _nearest_cluster(features, cluster_labels, point):
    """Return the label of the cluster whose centroid is closest to point, or None if there is no cluster."""
    centroids = features.groupby(np.asarray(cluster_labels)).mean()
    # DBSCAN marks noise samples with -1; noise is not a cluster to join.
    centroids = centroids.drop(index=-1, errors='ignore')
    if centroids.empty:
        return None
    distances = np.linalg.norm(centroids.to_numpy() - np.asarray(point, dtype=float), axis=1)
    return centroids.index[int(np.argmin(distances))]


def main():
    """
    Build the Streamlit page.

    Loads and standardizes the dataset, shows the feature correlations, runs
    the clustering algorithm selected in the sidebar inside its tab, assigns
    a user-entered data point to a cluster on request, and offers the
    preprocessed data for download.

    The new point is assigned to the cluster with the nearest centroid in the
    standardized feature space. This works for every algorithm offered here,
    including those without a predict() method (DBSCAN, Agglomerative
    Clustering), because it only needs the labels perform_clustering returns.
    """
    st.title("Unsupervised Learning on Student Performance Data")
    st.write("This application implements five unsupervised learning algorithms on a dataset of student performance. The algorithms include K-Means, DBSCAN, Agglomerative Clustering, Birch, and Mean Shift. The application provides interactive visualizations, parameter tuning, and performance metrics.")

    # Load and preprocess the data
    file_path = './Student-Employability-Datasets(Data).csv'
    df = load_data(file_path)
    if df is None:
        # load_data has already reported the failure to the user.
        return

    st.write("Preprocessed Data:")
    st.dataframe(df)

    # Scale and normalize the data
    df_for_scaling = df.drop(columns=['CLASS'])
    scaled_df = scale_normalize_data(df_for_scaling)
    st.write("Scaled and Normalized Data:")
    st.dataframe(scaled_df)

    # Feature correlation analysis
    st.write("Feature Correlation Analysis:")
    # Exclude 'CLASS' and 'Cluster' columns from correlation analysis
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    numeric_cols = [col for col in numeric_cols if col not in ['CLASS', 'Cluster']]
    corr_matrix = df[numeric_cols].corr()
    st.write(corr_matrix)
    # Explicit figure, closed after rendering, so figures do not accumulate
    # across Streamlit reruns.
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
    st.pyplot(fig)
    plt.close(fig)

    # Create a radio button for algorithm selection
    st.sidebar.header('Algorithms')
    selected_algorithm = st.sidebar.radio("Select Algorithm", ["K-Means", "DBSCAN", "Agglomerative Clustering", "Birch", "Mean Shift"])
    # Show parameters based on selected algorithm
    st.sidebar.header('Parameters')

    st.title("Algorithms Tab")
    st.write("Choose the algorithm above first so that the options will show about the algorithm of choice! :)")
    # Create tabs for each algorithm
    tab1, tab2, tab3, tab4, tab5 = st.tabs(["K-Means", "DBSCAN", "Agglomerative Clustering", "Birch", "Mean Shift"])
    tabs_by_algorithm = {
        "K-Means": tab1,
        "DBSCAN": tab2,
        "Agglomerative Clustering": tab3,
        "Birch": tab4,
        "Mean Shift": tab5,
    }

    # Each clustering call receives its own copy so that scaled_df stays a
    # pure feature matrix for the metrics and the nearest-centroid lookup.
    cluster_labels = None

    with tab1:
        st.header("K-Means Clustering")
        if selected_algorithm == "K-Means":
            n_clusters = st.sidebar.slider("Number of Clusters", 2, 10, 3, key='n_clusters')
            max_clusters = st.sidebar.slider("Maximum Number of Clusters for Elbow Curve", 2, 15, 10, key='max')
            cluster_labels = perform_clustering(scaled_df.copy(), 'kmeans', {'n_clusters': n_clusters, 'max_clusters': max_clusters})
            display_performance_metrics(scaled_df, cluster_labels)

    with tab2:
        st.header("DBSCAN Clustering")
        if selected_algorithm == 'DBSCAN':
            eps = st.sidebar.slider("Epsilon", 0.1, 1.0, 0.5, 0.1, key='eps')
            # In the sidebar like every other algorithm parameter.
            min_samples = st.sidebar.slider("Minimum Samples", 1, 10, 5, key='min_dbscan')
            cluster_labels = perform_clustering(scaled_df.copy(), 'dbscan', {'eps': eps, 'min_samples': min_samples})
            display_performance_metrics(scaled_df, cluster_labels)

    with tab3:
        st.header("Agglomerative Clustering")
        if selected_algorithm == 'Agglomerative Clustering':
            n_clusters = st.sidebar.slider("Number of Clusters", 2, 10, 3, key='agg_cluster')
            cluster_labels = perform_clustering(scaled_df.copy(), 'agglomerative', {'n_clusters': n_clusters})
            display_performance_metrics(scaled_df, cluster_labels)

    with tab4:
        st.header("Birch Clustering")
        if selected_algorithm == 'Birch':
            n_clusters = st.sidebar.slider("Number of Clusters", 2, 10, 3, key='birch_cluster')
            cluster_labels = perform_clustering(scaled_df.copy(), 'birch', {'n_clusters': n_clusters})
            display_performance_metrics(scaled_df, cluster_labels)

    with tab5:
        st.header("Mean Shift Clustering")
        if selected_algorithm == 'Mean Shift':
            bandwidth = st.sidebar.slider("Bandwidth", 0.1, 1.0, 0.5, 0.1, key='bandwidth')
            cluster_labels = perform_clustering(scaled_df.copy(), 'meanshift', {'bandwidth': bandwidth})
            display_performance_metrics(scaled_df, cluster_labels)

    # Allow users to input new data points. The sliders are in raw 1-5 units,
    # so the unscaled feature table is the matching reference.
    new_data = input_new_data(df_for_scaling)
    if st.sidebar.button("Predict Cluster for New Data"):
        feature_cols = list(scaled_df.columns)
        new_point = new_data[feature_cols].to_numpy()[0]
        with tabs_by_algorithm[selected_algorithm]:
            predicted = None
            if cluster_labels is not None:
                predicted = _nearest_cluster(scaled_df, cluster_labels, new_point)
            if predicted is None:
                st.warning("No clusters were found, so the new data point cannot be assigned.")
            else:
                st.write(f"Predicted Cluster for {selected_algorithm}: {predicted}")

    # Download results
    download_results(df)
291
+
292
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ streamlit
2
+ pandas
3
+ numpy
4
+ scikit-learn
5
+ matplotlib
6
+ seaborn