louiecerv committed on
Commit
5c5a3b6
·
1 Parent(s): 989f9c0

sync with remote

Browse files
Files changed (4) hide show
  1. app.py +437 -0
  2. marketing_campaign.csv +0 -0
  3. oldapp.py +185 -0
  4. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,437 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from sklearn.preprocessing import StandardScaler, MinMaxScaler
4
+ from sklearn.decomposition import PCA
5
+ from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
6
+ from sklearn.mixture import GaussianMixture
7
+ from sklearn.metrics import silhouette_score, adjusted_rand_score
8
+ from sklearn.ensemble import RandomForestClassifier
9
+ import matplotlib.pyplot as plt
10
+ import seaborn as sns
11
+ import io
12
+ import numpy as np
13
+
14
# Function to load the dataset with st.spinner
@st.cache_data  # Cache the data to speed up subsequent runs
def load_data():
    """Load the marketing campaign dataset from the working directory.

    Reads the tab-delimited ``marketing_campaign.csv`` while showing a
    Streamlit spinner.

    Returns:
        pd.DataFrame: the raw, uncleaned campaign data.
    """
    with st.spinner("Loading data..."):
        df = pd.read_csv("marketing_campaign.csv", delimiter='\t')
        return df
20
+
21
def handle_mixed_types(df):
    """Coerce columns holding more than one Python type to a single type.

    Columns mixing only numeric values (int/float) become float; any other
    mixture (e.g. numbers alongside strings) is stringified. Mutates and
    returns *df*.
    """
    for column in df.columns:
        observed = df[column].apply(type).unique()
        if len(observed) <= 1:
            continue  # already homogeneous, nothing to do
        numeric_only = all(issubclass(t, (int, float)) for t in observed)
        df[column] = df[column].astype(float if numeric_only else str)
    return df
32
+
33
def handle_nulls(df):
    """Fill missing values in every column of *df* and return it.

    Object (categorical) columns are filled with their most frequent value;
    all other columns are filled with the column mean.
    """
    for name in df.columns:
        series = df[name]
        if series.dtype == 'object':
            fill_value = series.mode()[0]  # most frequent category
        else:
            fill_value = series.mean()
        df[name] = series.fillna(fill_value)
    return df
40
+
41
# Function to check data type consistency
def check_data_types(df):
    """Parse the 'Dt_Customer' column into datetimes (day-first format)."""
    parsed = pd.to_datetime(df['Dt_Customer'], dayfirst=True)
    df['Dt_Customer'] = parsed
    return df
45
+
46
# Function to detect and remove outliers based on income
def remove_outliers(df):
    """Return *df* restricted to rows whose 'Income' lies inside the
    Tukey fences (1.5 * IQR beyond the quartiles), bounds inclusive."""
    q1, q3 = df['Income'].quantile([0.25, 0.75])
    spread = q3 - q1
    low = q1 - 1.5 * spread
    high = q3 + 1.5 * spread
    return df[df['Income'].between(low, high)]
55
+
56
# Function to visualize data distribution
def visualize_data(df):
    """Render distribution plots in the Streamlit page.

    Plots the three numeric columns with the highest variance, then a
    histogram of the 'Response' target. Each figure is serialized to a PNG
    buffer and shown with ``st.image``.
    """
    st.subheader("Data Visualization")
    # Select top 3 columns with highest variance (excluding date and object types)
    numerical_df = df.select_dtypes(exclude=['object', 'datetime'])  # Exclude datetime columns
    top_3_cols = numerical_df.var().sort_values(ascending=False).head(3).index.tolist()
    for col in top_3_cols:
        if df[col].dtype == 'object':
            # NOTE(review): top_3_cols is drawn from the numeric-only frame above,
            # so this countplot branch looks unreachable — confirm before relying on it.
            plt.figure(figsize=(10, 5))
            sns.countplot(x=col, data=df)
            plt.xticks(rotation=45)
            # Convert plot to image
            img = io.BytesIO()
            plt.savefig(img, format='png')
            img.seek(0)
            st.image(img)  # Display the image
        else:
            plt.figure(figsize=(10, 5))
            sns.histplot(x=col, data=df, kde=True)
            # Convert plot to image
            img = io.BytesIO()
            plt.savefig(img, format='png')
            img.seek(0)
            st.image(img)  # Display the image

    # Also show the distribution of the campaign response target.
    plt.figure(figsize=(10, 5))
    sns.histplot(x=df["Response"], data=df, kde=True)
    # Convert plot to image
    img = io.BytesIO()
    plt.savefig(img, format='png')
    img.seek(0)
    st.image(img)  # Display the image
88
+
89
# Preprocess data with PCA to exclude columns that do not contribute to clustering
def preprocess_data_with_pca_exclusion(df):
    """Build a reduced feature frame for clustering.

    Pipeline: one-hot encode categoricals -> expand 'Dt_Customer' into
    year/month/day -> MinMax-scale numerics -> keep the top 50% of features
    by RandomForest importance (fit against 'Response') -> fit PCA at 90%
    explained variance -> keep only original columns that dominate the PCA
    components.

    Returns:
        tuple: (X_contributing, y) where X_contributing is a DataFrame of the
        selected original (scaled) columns and y is ``df['Response']``.
    """
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
    df_encoded = pd.get_dummies(df, columns=categorical_cols)
    X = df_encoded.drop(columns=['Response'])
    # Decompose the enrollment date into numeric parts so it can be scaled.
    X['Dt_Customer_Year'] = X['Dt_Customer'].dt.year
    X['Dt_Customer_Month'] = X['Dt_Customer'].dt.month
    X['Dt_Customer_Day'] = X['Dt_Customer'].dt.day
    X = X.drop(columns=['Dt_Customer'])

    # MinMax scale numerical features
    scaler = MinMaxScaler()
    numerical_cols = X.select_dtypes(include=['number']).columns.tolist()
    X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

    # Feature importance analysis using Random Forest
    y = df['Response']
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X, y)
    feature_importances = model.feature_importances_
    important_features = np.argsort(feature_importances)[-int(0.5 * len(feature_importances)):]  # Retain top 50% features

    # Create a new dataframe with only the important features
    X_important = X.iloc[:, important_features]

    # Apply PCA to retain components that explain 90% of the variance
    pca = PCA(n_components=0.90)
    X_pca = pca.fit_transform(X_important)

    # Get the columns that contribute to the PCA components.
    # NOTE(review): this slice takes, per component, the indices of the
    # n_components largest *signed* loadings (not absolute values) — confirm
    # that signed rather than |loading| ranking is intended.
    pca_columns = pca.components_.argsort()[:, -1:-X_pca.shape[1]-1:-1]

    # Get the original column names that contribute to the PCA components
    contributing_columns = [X_important.columns[i] for i in pca_columns.flatten()]

    # Drop duplicate columns and keep only those that contribute to the PCA components
    contributing_columns = list(dict.fromkeys(contributing_columns))

    # Create a new dataframe with only the contributing columns
    X_contributing = X_important[contributing_columns]

    return X_contributing, df['Response']
131
+
132
# Function to run K-Means clustering
def run_kmeans(X, y_true):
    """Cluster *X* with 2-cluster K-Means and score the result.

    Returns (n_clusters, silhouette score, adjusted Rand index vs *y_true*);
    the Rand index is an explanatory string if only one cluster emerges.
    """
    model = KMeans(n_clusters=2, random_state=42)
    labels = model.fit_predict(X)
    quality = silhouette_score(X, labels)
    if len(set(labels)) > 1:
        agreement = adjusted_rand_score(y_true, labels)
    else:
        agreement = "N/A (Only one cluster found)"
    return model.n_clusters, quality, agreement
144
+
145
# Function to run Hierarchical clustering
def run_hierarchical(X, y_true):
    """Cluster *X* with 2-cluster agglomerative clustering and score it.

    Returns (n_clusters, silhouette score, adjusted Rand index vs *y_true*);
    the Rand index is an explanatory string if only one cluster emerges.
    """
    model = AgglomerativeClustering(n_clusters=2)
    labels = model.fit_predict(X)
    quality = silhouette_score(X, labels)
    agreement = (
        adjusted_rand_score(y_true, labels)
        if len(set(labels)) > 1
        else "N/A (Only one cluster found)"
    )
    return model.n_clusters, quality, agreement
157
+
158
# Function to run DBSCAN clustering
def run_dbscan(X, y_true):
    """Cluster *X* with DBSCAN (eps=1.0, min_samples=6) and score it.

    Returns (n_clusters, silhouette, adjusted Rand index); both metrics are
    explanatory strings when fewer than two real clusters are found.
    """
    labels = DBSCAN(eps=1.0, min_samples=6).fit_predict(X)
    distinct = set(labels)
    # The label -1 marks noise points, not a cluster of its own.
    cluster_count = len(distinct) - (1 if -1 in distinct else 0)
    if cluster_count > 1:
        return (
            cluster_count,
            silhouette_score(X, labels),
            adjusted_rand_score(y_true, labels),
        )
    placeholder = "N/A (Only one cluster found)"
    return cluster_count, placeholder, placeholder
171
+
172
# Function to run Gaussian Mixture clustering
def run_gaussian_mixture(X, y_true):
    """Fit a 2-component Gaussian mixture to *X* and score its hard labels.

    Returns (n_components, silhouette score, adjusted Rand index vs *y_true*);
    the Rand index is an explanatory string if only one cluster emerges.
    """
    model = GaussianMixture(n_components=2, random_state=42)
    labels = model.fit_predict(X)
    quality = silhouette_score(X, labels)
    agreement = (
        adjusted_rand_score(y_true, labels)
        if len(set(labels)) > 1
        else "N/A (Only one cluster found)"
    )
    return model.n_components, quality, agreement
184
+
185
# Main Streamlit app
def main():
    """Streamlit entry point.

    Loads and cleans the campaign data, explores cluster quality across
    algorithms and hyperparameters, and lets the user predict the cluster
    of a hand-entered customer profile.
    """
    st.title("Customer Segmentation Analysis App")

    with st.expander("About this App"):
        st.markdown("""
## About this App

This app is designed to analyze customer data from a marketing campaign and determine customer segmentation using k-means clustering and other machine learning models.

### Dataset Information

- **Number of Records:** 2,240
- **Number of Features:** 29

### Features:

- **ID:** Unique identifier for each customer.
- **Year_Birth:** Year of birth of the customer.
- **Education:** Education level of the customer.
- **Marital_Status:** Marital status of the customer.
- **Income:** Annual income of the customer.
- **Kidhome:** Number of small children in customer's household.
- **Teenhome:** Number of teenagers in customer's household.
- **Dt_Customer:** Date of customer's enrollment with the company.
- **Recency:** Number of days since the customer's last purchase.
- **MntWines:** Amount spent on wine in the last 2 years.
- **MntFruits:** Amount spent on fruits in the last 2 years.
- **MntMeatProducts:** Amount spent on meat products in the last 2 years.
- **MntFishProducts:** Amount spent on fish products in the last 2 years.
- **MntSweetProducts:** Amount spent on sweet products in the last 2 years.
- **MntGoldProds:** Amount spent on gold products in the last 2 years.
- **NumDealsPurchases:** Number of purchases made with a discount.
- **NumWebPurchases:** Number of purchases made through the company's website.
- **NumCatalogPurchases:** Number of purchases made using a catalogue.
- **NumStorePurchases:** Number of purchases made directly in stores.
- **NumWebVisitsMonth:** Number of visits to company's website in the last month.
- **AcceptedCmp3, AcceptedCmp4, AcceptedCmp5, AcceptedCmp1, AcceptedCmp2:** 1 if customer accepted the offer in the respective campaign, 0 otherwise.
- **Complain:** 1 if customer complained in the last 2 years, 0 otherwise.
- **Z_CostContact, Z_Revenue:** Internal features related to cost and revenue.
- **Response:** 1 if customer accepted the offer in the last campaign, 0 otherwise.

### Analysis

The app will use k-means clustering and other machine learning models to determine customer segmentation based on their purchasing behavior and demographic information. This will help in identifying distinct groups of customers and tailoring marketing strategies accordingly.

Created by: Louie F. Cervantes, M.Eng. (Information Engineering)
        """)

    # Load data
    df = load_data()

    # Data cleaning and validation
    #df = handle_mixed_types(df)
    df = handle_nulls(df)
    df = check_data_types(df)
    df = handle_mixed_types(df)

    # Remove outliers based on income
    df = remove_outliers(df)

    # Visualize data
    visualize_data(df)

    # Preprocess data
    X_contributing, y_true = preprocess_data_with_pca_exclusion(df)

    st.write("Optimizing Clustering...")

    st.write("Columns contributing to clustering:")
    st.write(X_contributing.columns.tolist())

    st.write("\nFirst few rows of the preprocessed data:")
    st.write(X_contributing.head())

    # Evaluate the optimal number of clusters using silhouette score
    silhouette_scores = []
    for n_clusters in range(2, 11):
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        y_pred = kmeans.fit_predict(X_contributing)
        silhouette_scores.append(silhouette_score(X_contributing, y_pred))

    optimal_n_clusters = range(2, 11)[silhouette_scores.index(max(silhouette_scores))]
    optimal_silhouette_score = max(silhouette_scores)

    st.write(f"Optimal number of clusters: {optimal_n_clusters}")
    st.write(f"Optimal silhouette score: {optimal_silhouette_score}")

    # Evaluate the explained variance ratio for PCA components
    explained_variance_ratio = PCA(n_components=0.95).fit(X_contributing).explained_variance_ratio_
    st.write(f"Explained variance ratio for PCA components: {explained_variance_ratio}")

    # Evaluate the scaling method (MinMaxScaler vs StandardScaler)
    scalers = [MinMaxScaler(), StandardScaler()]
    scaler_names = ['MinMaxScaler', 'StandardScaler']
    scaler_scores = []

    for scaler, name in zip(scalers, scaler_names):
        X_scaled = scaler.fit_transform(X_contributing)
        kmeans = KMeans(n_clusters=optimal_n_clusters, random_state=42)
        y_pred = kmeans.fit_predict(X_scaled)
        score = silhouette_score(X_scaled, y_pred)
        scaler_scores.append((name, score))

    best_scaler_name, best_scaler_score = max(scaler_scores, key=lambda x: x[1])

    st.write(f"Best scaling method: {best_scaler_name}")
    st.write(f"Best silhouette score with scaling: {best_scaler_score}")

    # Define the parameter grid for DBSCAN
    param_grid = {
        'eps': np.arange(0.1, 1.1, 0.1),
        'min_samples': np.arange(2, 11, 1)
    }

    # Initialize DBSCAN model
    dbscan_model = DBSCAN()

    # Perform grid search with silhouette score as the evaluation metric
    best_score = -1
    best_params = None

    for eps in param_grid['eps']:
        for min_samples in param_grid['min_samples']:
            dbscan_model.set_params(eps=eps, min_samples=min_samples)
            y_pred = dbscan_model.fit_predict(X_contributing)
            n_clusters = len(set(y_pred)) - (1 if -1 in y_pred else 0)
            if n_clusters > 1:
                silhouette = silhouette_score(X_contributing, y_pred)
                if silhouette > best_score:
                    best_score = silhouette
                    best_params = {'eps': eps, 'min_samples': min_samples}

    st.write(f"Best parameters: {best_params}")
    st.write(f"Best silhouette score: {best_score}")

    # Run DBSCAN with the best parameters
    # NOTE(review): if no (eps, min_samples) combination produced more than one
    # cluster, best_params stays None and set_params(**best_params) raises
    # TypeError — confirm the grid always succeeds on this dataset.
    dbscan_model.set_params(**best_params)
    y_pred_best = dbscan_model.fit_predict(X_contributing)
    n_clusters_best = len(set(y_pred_best)) - (1 if -1 in y_pred_best else 0)

    if n_clusters_best > 1:
        silhouette_best = silhouette_score(X_contributing, y_pred_best)
        rand_index_best = adjusted_rand_score(y_true, y_pred_best)
    else:
        silhouette_best = "N/A (Only one cluster found)"
        rand_index_best = "N/A (Only one cluster found)"

    st.write(f"Number of Clusters: {n_clusters_best}")
    st.write(f"Silhouette Score: {silhouette_best}")
    st.write(f"Rand Index: {rand_index_best}")

    # Create tabs
    tab1, tab2, tab3, tab4 = st.tabs(["K-Means", "Hierarchical", "DBSCAN", "Gaussian Mixture"])

    # Tab 1: K-Means
    with tab1:
        n_clusters, silhouette, rand_index = run_kmeans(X_contributing, y_true)
        st.write(f"Number of Clusters: {n_clusters}")
        st.write(f"Silhouette Score: {silhouette:.3f}")
        st.write(f"Rand Index: {rand_index}")

    # Tab 2: Hierarchical
    with tab2:
        n_clusters, silhouette, rand_index = run_hierarchical(X_contributing, y_true)
        st.write(f"Number of Clusters: {n_clusters}")
        st.write(f"Silhouette Score: {silhouette:.3f}")
        st.write(f"Rand Index: {rand_index}")

    # Tab 3: DBSCAN
    with tab3:
        n_clusters, silhouette, rand_index = run_dbscan(X_contributing, y_true)
        st.write(f"Number of Clusters: {n_clusters}")
        st.write(f"Silhouette Score: {silhouette}")
        st.write(f"Rand Index: {rand_index}")

    # Tab 4: Gaussian Mixture
    with tab4:
        n_clusters, silhouette, rand_index = run_gaussian_mixture(X_contributing, y_true)
        st.write(f"Number of Clusters: {n_clusters}")
        st.write(f"Silhouette Score: {silhouette:.3f}")
        st.write(f"Rand Index: {rand_index}")

    st.header("Predict Customer Cluster")

    # Create a form for user input
    with st.form(key='customer_form'):
        year_birth = st.number_input('Year of Birth', min_value=1900, max_value=2023, value=1980)
        education = st.selectbox('Education Level', df['Education'].unique())
        marital_status = st.selectbox('Marital Status', df['Marital_Status'].unique())
        income = st.number_input('Annual Income', min_value=0, value=50000)
        kidhome = st.number_input('Number of Small Children', min_value=0, max_value=10, value=0)
        teenhome = st.number_input('Number of Teenagers', min_value=0, max_value=10, value=0)
        recency = st.number_input('Recency (days since last purchase)', min_value=0, value=30)

        mnt_wines = st.number_input('Amount Spent on Wine', min_value=0, value=100)
        mnt_fruits = st.number_input('Amount Spent on Fruits', min_value=0, value=50)
        mnt_meat_products = st.number_input('Amount Spent on Meat Products', min_value=0, value=200)
        mnt_fish_products = st.number_input('Amount Spent on Fish Products', min_value=0, value=50)
        mnt_sweet_products = st.number_input('Amount Spent on Sweet Products', min_value=0, value=50)
        mnt_gold_prods = st.number_input('Amount Spent on Gold Products', min_value=0, value=100)

        num_deals_purchases = st.number_input('Number of Purchases with Discount', min_value=0, value=5)
        num_web_purchases = st.number_input('Number of Web Purchases', min_value=0, value=5)
        num_catalog_purchases = st.number_input('Number of Catalog Purchases', min_value=0, value=5)
        num_store_purchases = st.number_input('Number of Store Purchases', min_value=0, value=5)

        num_web_visits_month = st.number_input('Number of Web Visits per Month', min_value=0, value=5)

        submit_button = st.form_submit_button(label='Predict Cluster')

    # Predict cluster when form is submitted
    if submit_button:

        # Create a dataframe from user input
        user_data = pd.DataFrame({
            'Year_Birth': [year_birth],
            'Education': [education],
            'Marital_Status': [marital_status],
            'Income': [income],
            'Kidhome': [kidhome],
            'Teenhome': [teenhome],
            'Recency': [recency],
            'MntWines': [mnt_wines],
            'MntFruits': [mnt_fruits],
            'MntMeatProducts': [mnt_meat_products],
            'MntFishProducts': [mnt_fish_products],
            'MntSweetProducts': [mnt_sweet_products],
            'MntGoldProds': [mnt_gold_prods],
            'NumDealsPurchases': [num_deals_purchases],
            'NumWebPurchases': [num_web_purchases],
            'NumCatalogPurchases': [num_catalog_purchases],
            'NumStorePurchases': [num_store_purchases],
            'NumWebVisitsMonth': [num_web_visits_month]
        })

        # One-hot encode user input
        user_data_encoded = pd.get_dummies(user_data, columns=['Education', 'Marital_Status'])

        # Align the encoded user data with the training data; features the user
        # did not supply (e.g. Dt_Customer-derived columns) are filled with 0.
        user_data_encoded = user_data_encoded.reindex(columns=X_contributing.columns, fill_value=0)

        # Standardize the user input
        # NOTE(review): `scaler` and `kmeans` here are whatever was last bound
        # by the scaler-comparison loop above (StandardScaler and the model fit
        # on its scaled data) — confirm these, not the "best" pair, are the
        # intended artifacts for prediction.
        user_data_scaled = scaler.transform(user_data_encoded)

        # Predict the cluster
        cluster = kmeans.predict(user_data_scaled)

        st.write(f'The predicted customer cluster is: {cluster[0]}')

    st.markdown("""(c) 2025 West Visayas State University - College of Information and Communications Technology""")

if __name__ == "__main__":
    main()
marketing_campaign.csv ADDED
The diff for this file is too large to render. See raw diff
 
oldapp.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from sklearn.preprocessing import StandardScaler, MinMaxScaler
4
+ from sklearn.decomposition import PCA
5
+ from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
6
+ from sklearn.mixture import GaussianMixture
7
+ from sklearn.metrics import silhouette_score, adjusted_rand_score
8
+ import matplotlib.pyplot as plt
9
+ import seaborn as sns
10
+ import io
11
+
12
# Function to load the dataset with st.spinner
@st.cache_data  # Cache the data to speed up subsequent runs
def load_data():
    """Load the tab-delimited ``marketing_campaign.csv`` with a spinner.

    Returns:
        pd.DataFrame: the raw, uncleaned campaign data.
    """
    with st.spinner("Loading data..."):
        df = pd.read_csv("marketing_campaign.csv", delimiter='\t')
        return df
18
+
19
def handle_mixed_types(df):
    """Force each mixed-type column of *df* to a single type.

    A column whose values are all int/float becomes float; any other mix is
    converted to string. Mutates and returns *df*.
    """
    for column in df.columns:
        kinds = df[column].apply(type).unique()
        if len(kinds) <= 1:
            continue  # column is already homogeneous
        if all(issubclass(k, (int, float)) for k in kinds):
            df[column] = df[column].astype(float)
        else:
            df[column] = df[column].astype(str)
    return df
30
+
31
def handle_nulls(df):
    """Fill missing values in every column of *df* and return it.

    Object (categorical) columns are filled with their most frequent value;
    numeric columns are filled with the column mean.
    """
    for col in df.columns:
        if df[col].dtype == 'object':
            # BUG FIX: fillna(df[col].mode()) passed a Series, which pandas
            # aligns by row index — only rows whose index happened to match a
            # mode position were filled. Use the scalar top mode instead.
            df[col] = df[col].fillna(df[col].mode()[0])  # Explicit assignment for categorical
        else:
            df[col] = df[col].fillna(df[col].mean())  # Explicit assignment for numerical
    return df
38
+
39
# Function to check data type consistency
def check_data_types(df):
    """Convert the 'Dt_Customer' column to datetimes, day-first."""
    df['Dt_Customer'] = pd.to_datetime(df['Dt_Customer'], dayfirst=True)
    return df
43
+
44
# Function to visualize data distribution
def visualize_data(df):
    """Plot the three numeric columns of *df* with the highest variance.

    Each figure is rendered to a PNG buffer and displayed via ``st.image``.
    """
    st.subheader("Data Visualization")
    # Select top 3 columns with highest variance (excluding date and object types)
    numerical_df = df.select_dtypes(exclude=['object', 'datetime'])  # Exclude datetime columns
    top_3_cols = numerical_df.var().sort_values(ascending=False).head(3).index.tolist()
    for col in top_3_cols:
        if df[col].dtype == 'object':
            # NOTE(review): top_3_cols comes from the numeric-only frame above,
            # so this branch looks unreachable — confirm before relying on it.
            plt.figure(figsize=(10, 5))
            sns.countplot(x=col, data=df)
            plt.xticks(rotation=45)
            # Convert plot to image
            img = io.BytesIO()
            plt.savefig(img, format='png')
            img.seek(0)
            st.image(img)  # Display the image
        else:
            plt.figure(figsize=(10, 5))
            sns.histplot(x=col, data=df, kde=True)
            # Convert plot to image
            img = io.BytesIO()
            plt.savefig(img, format='png')
            img.seek(0)
            st.image(img)  # Display the image
68
+
69
# Function to preprocess data with PCA
def preprocess_data_with_pca(df):
    """One-hot encode, scale, and PCA-reduce *df* for clustering.

    Returns:
        tuple: (X_pca, y) — the PCA projection retaining 95% of the
        variance, and the untouched ``df['Response']`` labels.
    """
    st.subheader("Preprocessed Data with PCA")
    # One-hot encode categorical features
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
    df_encoded = pd.get_dummies(df, columns=categorical_cols)
    # Drop 'Response' column for clustering
    X = df_encoded.drop(columns=['Response'])
    # Expand the enrollment date into numeric year/month components.
    X['Dt_Customer_Year'] = X['Dt_Customer'].dt.year
    X['Dt_Customer_Month'] = X['Dt_Customer'].dt.month
    X = X.drop(columns=['Dt_Customer'])
    # MinMax scale numerical features
    scaler = MinMaxScaler()
    numerical_cols = X.select_dtypes(include=['number']).columns.tolist()
    X[numerical_cols] = scaler.fit_transform(X[numerical_cols])
    # Apply PCA
    pca = PCA(n_components=0.95)  # Retain 95% of variance
    X_pca = pca.fit_transform(X)
    st.write(pd.DataFrame(X_pca).head())
    return X_pca, df['Response']
89
+
90
# Function to run K-Means clustering
def run_kmeans(X, y_true):
    """Cluster *X* into 5 groups with K-Means and score the result.

    Returns (n_clusters, silhouette score, adjusted Rand index vs *y_true*);
    the Rand index is an explanatory string if only one cluster emerges.
    """
    model = KMeans(n_clusters=5, random_state=42)
    labels = model.fit_predict(X)
    quality = silhouette_score(X, labels)
    if len(set(labels)) > 1:
        agreement = adjusted_rand_score(y_true, labels)
    else:
        agreement = "N/A (Only one cluster found)"
    return model.n_clusters, quality, agreement
102
+
103
# Function to run Hierarchical clustering
def run_hierarchical(X, y_true):
    """Cluster *X* with 5-cluster agglomerative clustering and score it.

    Returns (n_clusters, silhouette score, adjusted Rand index vs *y_true*);
    the Rand index is an explanatory string if only one cluster emerges.
    """
    model = AgglomerativeClustering(n_clusters=5)
    labels = model.fit_predict(X)
    quality = silhouette_score(X, labels)
    agreement = (
        adjusted_rand_score(y_true, labels)
        if len(set(labels)) > 1
        else "N/A (Only one cluster found)"
    )
    return model.n_clusters, quality, agreement
115
+
116
# Function to run DBSCAN clustering
def run_dbscan(X, y_true):
    """Cluster *X* with DBSCAN (eps=0.5, min_samples=5) and score it.

    Returns (n_clusters, silhouette, adjusted Rand index); both metrics are
    explanatory strings when fewer than two real clusters are found.
    """
    labels = DBSCAN(eps=0.5, min_samples=5).fit_predict(X)
    distinct = set(labels)
    # The label -1 marks noise points, not a cluster of its own.
    cluster_count = len(distinct) - (1 if -1 in distinct else 0)
    if cluster_count > 1:
        return (
            cluster_count,
            silhouette_score(X, labels),
            adjusted_rand_score(y_true, labels),
        )
    placeholder = "N/A (Only one cluster found)"
    return cluster_count, placeholder, placeholder
129
+
130
# Function to run Gaussian Mixture clustering
def run_gaussian_mixture(X, y_true):
    """Fit a 5-component Gaussian mixture to *X* and score its hard labels.

    Returns (n_components, silhouette score, adjusted Rand index vs *y_true*);
    the Rand index is an explanatory string if only one cluster emerges.
    """
    model = GaussianMixture(n_components=5, random_state=42)
    labels = model.fit_predict(X)
    quality = silhouette_score(X, labels)
    agreement = (
        adjusted_rand_score(y_true, labels)
        if len(set(labels)) > 1
        else "N/A (Only one cluster found)"
    )
    return model.n_components, quality, agreement
142
+
143
# Main Streamlit app
def main():
    """Streamlit entry point: clean the data, then compare four clustering
    algorithms (K-Means, hierarchical, DBSCAN, Gaussian mixture) in tabs."""
    st.title("Customer Segmentation App")
    # Load data
    df = load_data()
    # Data cleaning and validation
    # NOTE(review): handle_mixed_types runs both before and after the other
    # cleaning steps — confirm the first call is intentional.
    df = handle_mixed_types(df)
    df = handle_nulls(df)
    df = check_data_types(df)
    df = handle_mixed_types(df)
    # Visualize data
    visualize_data(df)
    # Preprocess data
    X_pca, y_true = preprocess_data_with_pca(df)
    # Create tabs
    tab1, tab2, tab3, tab4 = st.tabs(["K-Means", "Hierarchical", "DBSCAN", "Gaussian Mixture"])
    # Tab 1: K-Means
    with tab1:
        n_clusters, silhouette, rand_index = run_kmeans(X_pca, y_true)
        st.write(f"Number of Clusters: {n_clusters}")
        st.write(f"Silhouette Score: {silhouette:.3f}")
        st.write(f"Rand Index: {rand_index}")
    # Tab 2: Hierarchical
    with tab2:
        n_clusters, silhouette, rand_index = run_hierarchical(X_pca, y_true)
        st.write(f"Number of Clusters: {n_clusters}")
        st.write(f"Silhouette Score: {silhouette:.3f}")
        st.write(f"Rand Index: {rand_index}")
    # Tab 3: DBSCAN
    with tab3:
        n_clusters, silhouette, rand_index = run_dbscan(X_pca, y_true)
        st.write(f"Number of Clusters: {n_clusters}")
        # Silhouette may be an "N/A" string here, so no :.3f formatting.
        st.write(f"Silhouette Score: {silhouette}")
        st.write(f"Rand Index: {rand_index}")
    # Tab 4: Gaussian Mixture
    with tab4:
        n_clusters, silhouette, rand_index = run_gaussian_mixture(X_pca, y_true)
        st.write(f"Number of Clusters: {n_clusters}")
        st.write(f"Silhouette Score: {silhouette:.3f}")
        st.write(f"Rand Index: {rand_index}")

if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ streamlit
2
+ pandas
3
+ scikit-learn
4
+ matplotlib
5
+ seaborn