DhominickJ committed on
Commit
28a5f7d
·
1 Parent(s): a0f7bfa

Initial Commit for the Mall Customers Prediction

Browse files
Files changed (3) hide show
  1. app.py +233 -0
  2. googleplaystoreapps.csv +0 -0
  3. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns
6
+ from sklearn.preprocessing import StandardScaler, LabelEncoder
7
+ from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
8
+ from sklearn.mixture import GaussianMixture
9
+ from sklearn.decomposition import PCA
10
+ from sklearn.metrics import silhouette_score
11
+ import plotly.express as px
12
+
13
+ # Function to load and preprocess the data
14
def _size_to_megabytes(raw_size):
    """Convert a size string such as '19M' or '201k' to a float in megabytes."""
    text = str(raw_size)
    if 'M' in text:
        return float(text.replace('M', ''))
    # Anything without an 'M' is treated as kilobytes (the 'k' suffix is optional).
    return float(text.replace('k', '')) / 1000


def load_and_preprocess_data(file_uploaded):
    """Load the apps CSV and build the feature matrix used for clustering.

    Args:
        file_uploaded: Anything ``pd.read_csv`` accepts (a path or a file-like
            object such as a Streamlit upload).

    Returns:
        A tuple ``(df, scaled_data, scaler)`` where ``df`` holds the cleaned
        feature columns, ``scaled_data`` is a 2-D array whose columns are the
        five standardized numerical features followed by the label-encoded
        'Category', 'Content Rating', 'Genres' and 'Type' columns (in that
        order), and ``scaler`` is the ``StandardScaler`` fitted on the
        numerical features.

    On any failure the error is shown with ``st.error`` and the script run is
    halted with ``st.stop()`` so callers never receive a partial result.
    """
    categorical_columns = ['Category', 'Content Rating', 'Genres', 'Type']
    numerical_features = ['Rating', 'Reviews', 'Size', 'Installs', 'Price']
    features = ['Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres']

    try:
        df = pd.read_csv(file_uploaded)
        df = df.dropna()

        # Replace 'Varies with device' with the most frequent known size (the mode).
        known_sizes = df.loc[df['Size'] != 'Varies with device', 'Size']
        df['Size'] = df['Size'].replace('Varies with device', known_sizes.mode()[0])

        # Convert the textual columns to numbers.
        df['Size'] = df['Size'].apply(_size_to_megabytes)
        df['Installs'] = df['Installs'].apply(lambda x: int(str(x).replace('+', '').replace(',', '')))
        df['Price'] = df['Price'].apply(lambda x: float(str(x).replace('$', '')))
        # 'Reviews' can be read as strings; make it numeric so means and
        # correlations computed by callers work on it.
        df['Reviews'] = pd.to_numeric(df['Reviews'])

        # Keep only the columns that take part in clustering.
        df = df[features].copy()

        # Scale only the numerical features.
        scaler = StandardScaler()
        df_scaled = pd.DataFrame(scaler.fit_transform(df[numerical_features]), columns=numerical_features)

        # Append label-encoded categorical features (unscaled). fit_transform
        # returns a plain array, so the assignment is positional.
        for col in categorical_columns:
            df_scaled[col + '_encoded'] = LabelEncoder().fit_transform(df[col])

        scaled_data = df_scaled.values

        return df, scaled_data, scaler
    except Exception as e:
        st.error(f"Error loading and preprocessing data: {e}")
        # Halt this script run instead of implicitly returning None, which the
        # caller would fail to unpack into three values.
        st.stop()
59
+
60
+ # Function to implement KMeans
61
def kmeans_clustering(scaled_data, n_clusters):
    """Fit KMeans with a fixed seed and return ``(labels, fitted_model)``."""
    model = KMeans(random_state=42, n_clusters=n_clusters)
    model.fit(scaled_data)
    assignments = model.labels_
    return assignments, model
65
+
66
+ # Function to implement DBSCAN
67
def dbscan_clustering(scaled_data, eps, min_samples):
    """Fit DBSCAN on the data and return ``(labels, fitted_model)``."""
    model = DBSCAN(min_samples=min_samples, eps=eps)
    model.fit(scaled_data)
    assignments = model.labels_
    return assignments, model
71
+
72
+ # Function to implement Agglomerative Clustering
73
def agglomerative_clustering(scaled_data, n_clusters):
    """Fit agglomerative clustering and return ``(labels, fitted_model)``."""
    model = AgglomerativeClustering(n_clusters=n_clusters)
    model.fit(scaled_data)
    assignments = model.labels_
    return assignments, model
77
+
78
+ # Function to implement Gaussian Mixture Model
79
def gaussian_mixture_clustering(scaled_data, n_components):
    """Fit a seeded Gaussian mixture and return ``(predicted_labels, fitted_model)``."""
    mixture = GaussianMixture(random_state=42, n_components=n_components)
    mixture.fit(scaled_data)
    # The labels are the model's predictions on the same data it was fitted on.
    assignments = mixture.predict(scaled_data)
    return assignments, mixture
83
+
84
+ # Function to plot scatter plot
85
def plot_scatter(df, labels, title, scaled_data):
    """Render a 2-D PCA projection of the data, coloured by cluster label.

    Args:
        df: Unused; kept so existing callers keep working.
        labels: One cluster id per row of ``scaled_data``.
        title: Chart title.
        scaled_data: Feature matrix that was clustered.
    """
    pca = PCA(n_components=2)
    reduced_data = pca.fit_transform(scaled_data)
    df_pca = pd.DataFrame(reduced_data, columns=['PC1', 'PC2'])

    # Cluster ids are nominal: pass them as strings so Plotly assigns discrete
    # colours instead of a continuous colour scale, and fix the legend order
    # to the numeric order of the ids.
    label_array = np.asarray(labels)
    df_pca['Cluster'] = label_array.astype(str)
    legend_order = [str(cluster_id) for cluster_id in np.unique(label_array)]

    fig = px.scatter(
        df_pca,
        x='PC1',
        y='PC2',
        color='Cluster',
        title=title,
        category_orders={'Cluster': legend_order},
    )
    st.plotly_chart(fig)
92
+
93
+ # Function to plot elbow curve
94
def plot_elbow_curve(scaled_data, max_clusters):
    """Plot KMeans inertia (WCSS) for 1..``max_clusters`` clusters.

    Args:
        scaled_data: Feature matrix to cluster.
        max_clusters: Largest cluster count to evaluate (inclusive).
    """
    cluster_counts = range(1, max_clusters + 1)
    wcss = []
    for k in cluster_counts:
        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(scaled_data)
        wcss.append(kmeans.inertia_)

    fig, ax = plt.subplots()
    ax.plot(cluster_counts, wcss, marker='o')
    ax.set_title('Elbow Curve')
    ax.set_xlabel('Number of Clusters')
    ax.set_ylabel('WCSS')
    st.pyplot(fig)
    # Streamlit reruns the script on every interaction; close the figure so
    # pyplot does not keep one more open figure per rerun.
    plt.close(fig)
106
+
107
+ # Function to display performance metrics
108
def display_performance_metrics(labels, scaled_data):
    """Write the silhouette score for a clustering, when it is defined.

    The silhouette score needs at least 2 distinct labels and fewer distinct
    labels than samples; outside that range an explanatory message is written
    instead of calling ``silhouette_score`` (which would raise).

    Args:
        labels: One cluster id per row of ``scaled_data``.
        scaled_data: Feature matrix that was clustered.
    """
    n_labels = len(set(labels))
    n_samples = len(labels)

    if n_labels < 2:
        st.write("Silhouette Score cannot be computed for a single cluster.")
        return
    if n_labels >= n_samples:
        # Reachable e.g. with DBSCAN and min_samples=1, where every point can
        # end up in its own cluster.
        st.write("Silhouette Score cannot be computed when every sample is its own cluster.")
        return

    silhouette = silhouette_score(scaled_data, labels)
    st.write(f"Silhouette Score: {silhouette:.2f}")
114
+
115
+ # Define categorical columns globally
116
# Categorical feature names, read by main() when building the new-data-point inputs.
categorical_columns = [
    'Category',
    'Content Rating',
    'Genres',
    'Type',
]
117
+
118
+ # Main function
119
def _assign_dbscan_cluster(dbscan, point):
    """Return the cluster id a fitted DBSCAN model implies for one new sample.

    DBSCAN has no ``predict``. The sample is given the label of its nearest
    core sample when that core sample is within ``eps`` (Euclidean distance);
    otherwise it is reported as noise (-1).

    Args:
        dbscan: A fitted ``DBSCAN`` instance.
        point: Array of shape (1, n_features) in the model's feature space.
    """
    core_points = dbscan.components_
    if len(core_points) == 0:
        return -1
    distances = np.linalg.norm(core_points - point, axis=1)
    nearest = int(np.argmin(distances))
    if distances[nearest] > dbscan.eps:
        return -1
    return int(dbscan.labels_[dbscan.core_sample_indices_[nearest]])


def main():
    """Run the Streamlit dashboard: load data, cluster it, and classify a new point.

    The sidebar hosts the file upload, the algorithm parameters, the inputs
    for a new data point, the prediction button and the results download. The
    main area shows one tab per clustering algorithm plus a correlation heatmap.
    """
    st.title("Unsupervised Learning for App Recommendation")

    # File upload (falls back to the bundled dataset when nothing is uploaded).
    st.sidebar.header("Upload Custom Data Here")
    file = st.sidebar.file_uploader("Upload CSV file", type=["csv"])
    if file is None:
        file = './googleplaystoreapps.csv'

    loaded = load_and_preprocess_data(file)
    if loaded is None:
        # The loader has already reported the error; nothing more can be shown.
        st.stop()
    df, scaled_data, scaler = loaded

    # Sidebar for parameter tuning
    st.sidebar.header("Parameter Tuning")
    n_clusters = st.sidebar.slider("Number of Clusters", 2, 10, 3)
    eps = st.sidebar.slider("Epsilon (DBSCAN)", 0.1, 1.0, 0.5, 0.1)
    min_samples = st.sidebar.slider("Minimum Samples (DBSCAN)", 1, 10, 5)
    n_components = st.sidebar.slider("Number of Components (GMM)", 2, 10, 3)

    # Tabs for different algorithms. Each algorithm keeps its own label array
    # so later steps (prediction, export) cannot mix them up.
    tab1, tab2, tab3, tab4, tab5 = st.tabs(["KMeans", "DBSCAN", "Agglomerative Clustering", "Gaussian Mixture Model", "Feature Correlation"])

    with tab1:
        st.header("KMeans Clustering")
        kmeans_labels, kmeans = kmeans_clustering(scaled_data, n_clusters)
        plot_scatter(df, kmeans_labels, "KMeans Clustering", scaled_data)
        display_performance_metrics(kmeans_labels, scaled_data)
        plot_elbow_curve(scaled_data, 10)

    with tab2:
        st.header("DBSCAN Clustering")
        dbscan_labels, dbscan = dbscan_clustering(scaled_data, eps, min_samples)
        plot_scatter(df, dbscan_labels, "DBSCAN Clustering", scaled_data)
        display_performance_metrics(dbscan_labels, scaled_data)

    with tab3:
        st.header("Agglomerative Clustering")
        agglomerative_labels, agglomerative = agglomerative_clustering(scaled_data, n_clusters)
        plot_scatter(df, agglomerative_labels, "Agglomerative Clustering", scaled_data)
        display_performance_metrics(agglomerative_labels, scaled_data)

    with tab4:
        st.header("Gaussian Mixture Model")
        gmm_labels, gmm = gaussian_mixture_clustering(scaled_data, n_components)
        plot_scatter(df, gmm_labels, "Gaussian Mixture Model", scaled_data)
        display_performance_metrics(gmm_labels, scaled_data)

    with tab5:
        st.header("Feature Correlation Analysis")
        numerical_df = df.select_dtypes(include=[np.number])
        corr_matrix = numerical_df.corr()
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
        st.pyplot(fig)
        # Avoid accumulating one open pyplot figure per script rerun.
        plt.close(fig)

    # User input for prediction
    st.sidebar.header("Input New Data Point")
    numerical_features = ['Rating', 'Reviews', 'Size', 'Installs', 'Price']

    # One encoder per categorical column, fitted on the values present in df.
    original_values = {col: df[col].unique() for col in categorical_columns}
    le_dict = {col: LabelEncoder().fit(original_values[col]) for col in categorical_columns}

    new_data = {}
    for col in df.columns:
        if col in categorical_columns:
            # Show the original values but store the encoded value.
            selected_value = st.sidebar.selectbox(f"Select {col}", original_values[col])
            new_data[col] = le_dict[col].transform([selected_value])[0]
        else:
            # Default to the column mean (no Rating-specific 1..5 clipping,
            # which distorted the defaults of the other numeric columns).
            mean_value = pd.to_numeric(df[col]).mean()
            new_data[col] = st.sidebar.number_input(f"Enter {col}", value=float(mean_value))

    new_data_df = pd.DataFrame([new_data])

    # Build the feature vector in the same column order as scaled_data:
    # scaled numerical features first, then the encoded categorical features
    # in categorical_columns order.
    new_data_scaled = np.hstack([
        scaler.transform(new_data_df[numerical_features]),
        new_data_df[categorical_columns].values,
    ])

    # Predict cluster for new data point
    st.sidebar.header("Cluster Prediction")
    if st.sidebar.button("Predict"):
        kmeans_label = kmeans.predict(new_data_scaled)
        # Do not refit DBSCAN on the single point; derive the label from the
        # already fitted model instead.
        dbscan_label = _assign_dbscan_cluster(dbscan, new_data_scaled)
        # AgglomerativeClustering has no predict(); -1 is shown as a placeholder.
        agglomerative_label = -1
        gmm_label = gmm.predict(new_data_scaled)

        st.sidebar.write(f"KMeans Cluster: {kmeans_label[0]}")
        st.sidebar.write(f"DBSCAN Cluster: {dbscan_label}")
        st.sidebar.write(f"Agglomerative Cluster: {agglomerative_label}")
        st.sidebar.write(f"GMM Cluster: {gmm_label[0]}")

    # Download results
    st.sidebar.header("Download Results")
    if st.sidebar.button("Download Results"):
        results = pd.DataFrame({
            'Cluster (KMeans)': kmeans_labels,
            'Cluster (DBSCAN)': dbscan_labels,
            'Cluster (Agglomerative)': agglomerative_labels,
            'Cluster (GMM)': gmm_labels,
        })
        st.sidebar.download_button("Download CSV", results.to_csv(index=False), "results.csv", mime="text/csv")
231
+
232
# Start the dashboard only when this file is executed as a script.
if __name__ == "__main__":
    main()
googleplaystoreapps.csv ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
streamlit
pandas
numpy
scikit-learn
matplotlib
seaborn
# app.py imports plotly.express, so plotly must be installed as well.
plotly