# Streamlit app (Hugging Face Space): customer segmentation analysis.
| import streamlit as st | |
| import pandas as pd | |
| from sklearn.preprocessing import StandardScaler, MinMaxScaler | |
| from sklearn.decomposition import PCA | |
| from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN | |
| from sklearn.mixture import GaussianMixture | |
| from sklearn.metrics import silhouette_score, adjusted_rand_score | |
| from sklearn.ensemble import RandomForestClassifier | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| import io | |
| import numpy as np | |
# Load the dataset, showing a spinner while reading.
# Cache the result so Streamlit reruns do not re-read the CSV on every
# widget interaction (the old comment promised caching but never applied it).
@st.cache_data
def load_data():
    """Read the tab-separated marketing campaign dataset.

    Returns:
        pd.DataFrame: the raw campaign data from marketing_campaign.csv.
    """
    with st.spinner("Loading data..."):
        df = pd.read_csv("marketing_campaign.csv", delimiter='\t')
    return df
def handle_mixed_types(df):
    """Coerce columns whose cells hold several Python types to one type.

    A column mixing only ints and floats becomes float; any other mix
    (e.g. numbers alongside strings) is cast to str. Homogeneous columns
    are left untouched. The frame is modified in place and returned.
    """
    for column in df.columns:
        cell_types = df[column].apply(type).unique()
        if len(cell_types) <= 1:
            continue  # already a single type; nothing to do
        numeric_only = all(issubclass(t, (int, float)) for t in cell_types)
        df[column] = df[column].astype(float if numeric_only else str)
    return df
def handle_nulls(df):
    """Fill missing values column by column, in place.

    Categorical (object-dtype) columns are filled with the column mode;
    numerical columns with the column mean. An entirely-null object
    column is left unchanged, since its mode is undefined.

    Returns:
        pd.DataFrame: the same frame, for call chaining.
    """
    for col in df.columns:
        if df[col].dtype == 'object':
            mode = df[col].mode()
            # mode() returns an empty Series for an all-null column;
            # guard against the IndexError that mode[0] would raise.
            if not mode.empty:
                df[col] = df[col].fillna(mode[0])
        else:
            df[col] = df[col].fillna(df[col].mean())
    return df
# Normalize column dtypes so downstream code can rely on them.
def check_data_types(df):
    """Convert 'Dt_Customer' from day-first date strings to datetimes."""
    parsed = pd.to_datetime(df['Dt_Customer'], dayfirst=True)
    df['Dt_Customer'] = parsed
    return df
# Drop rows whose 'Income' lies outside the Tukey (1.5 * IQR) fences.
def remove_outliers(df):
    """Return df restricted to rows with non-outlier 'Income' values."""
    q1 = df['Income'].quantile(0.25)
    q3 = df['Income'].quantile(0.75)
    spread = q3 - q1
    low = q1 - 1.5 * spread
    high = q3 + 1.5 * spread
    # between() is inclusive on both ends, matching >= low and <= high.
    return df[df['Income'].between(low, high)]
# Function to visualize data distribution
def visualize_data(df):
    """Render histograms (with KDE) for the three highest-variance
    numerical columns, plus the distribution of the 'Response' target.
    """
    st.subheader("Data Visualization")
    # Rank columns by variance; object and datetime columns are excluded
    # because variance is meaningless/undefined for them.
    numerical_df = df.select_dtypes(exclude=['object', 'datetime'])
    top_3_cols = numerical_df.var().sort_values(ascending=False).head(3).index.tolist()
    # The selected columns are numeric by construction, so the former
    # countplot branch for object dtypes was unreachable and was removed.
    for col in top_3_cols:
        plt.figure(figsize=(10, 5))
        sns.histplot(x=col, data=df, kde=True)
        _render_current_figure()
    # Distribution of the campaign response target.
    plt.figure(figsize=(10, 5))
    sns.histplot(x=df["Response"], data=df, kde=True)
    _render_current_figure()

def _render_current_figure():
    """Serialize the active matplotlib figure to PNG, display it in
    Streamlit, then close it to avoid leaking figure memory."""
    img = io.BytesIO()
    plt.savefig(img, format='png')
    img.seek(0)
    st.image(img)  # Display the image
    plt.close()
# Preprocess data with PCA to exclude columns that do not contribute to clustering
def preprocess_data_with_pca_exclusion(df):
    """Build the clustering feature matrix.

    Pipeline: one-hot encode categoricals -> expand the enrollment date
    into year/month/day -> MinMax-scale numeric columns -> keep the top
    50% of features by RandomForest importance against 'Response' ->
    keep only the features that load most strongly on the PCA
    components explaining 90% of the variance.

    Returns:
        tuple: (X_contributing DataFrame of selected features,
        the 'Response' Series used as ground truth downstream).
    """
    # One-hot encode every object-dtype column.
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
    df_encoded = pd.get_dummies(df, columns=categorical_cols)
    # 'Response' is the supervision target, not a clustering feature.
    X = df_encoded.drop(columns=['Response'])
    # Expand the datetime enrollment column into numeric components.
    X['Dt_Customer_Year'] = X['Dt_Customer'].dt.year
    X['Dt_Customer_Month'] = X['Dt_Customer'].dt.month
    X['Dt_Customer_Day'] = X['Dt_Customer'].dt.day
    X = X.drop(columns=['Dt_Customer'])
    # MinMax scale numerical features
    scaler = MinMaxScaler()
    numerical_cols = X.select_dtypes(include=['number']).columns.tolist()
    X[numerical_cols] = scaler.fit_transform(X[numerical_cols])
    # Feature importance analysis using Random Forest
    y = df['Response']
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X, y)
    feature_importances = model.feature_importances_
    # argsort is ascending, so the tail slice holds the most important half.
    important_features = np.argsort(feature_importances)[-int(0.5 * len(feature_importances)):]  # Retain top 50% features
    # Create a new dataframe with only the important features
    X_important = X.iloc[:, important_features]
    # Apply PCA to retain components that explain 90% of the variance
    pca = PCA(n_components=0.90)
    X_pca = pca.fit_transform(X_important)
    # For each PCA component, take the indices of its highest-loading
    # input features. NOTE(review): this ranks raw loadings, not absolute
    # values, so strongly negative loadings are ignored -- confirm this
    # heuristic is the intended selection rule.
    pca_columns = pca.components_.argsort()[:, -1:-X_pca.shape[1]-1:-1]
    # Get the original column names that contribute to the PCA components
    contributing_columns = [X_important.columns[i] for i in pca_columns.flatten()]
    # Drop duplicate columns and keep only those that contribute to the PCA components
    contributing_columns = list(dict.fromkeys(contributing_columns))
    # Create a new dataframe with only the contributing columns
    X_contributing = X_important[contributing_columns]
    return X_contributing, df['Response']
# Function to run K-Means clustering
def run_kmeans(X, y_true):
    """Cluster X with K-Means (k=2) and score the result.

    Returns:
        tuple: (n_clusters, silhouette score, adjusted Rand index vs.
        y_true). Both metrics degrade to an "N/A" string when fewer than
        two distinct labels are produced, since both are undefined then.
    """
    kmeans = KMeans(n_clusters=2, random_state=42)
    y_pred = kmeans.fit_predict(X)
    n_clusters = kmeans.n_clusters
    # silhouette_score raises for a single label, so it must sit under
    # the same guard as the Rand index (previously it was computed
    # unconditionally and could crash before the guard was reached).
    if len(set(y_pred)) > 1:
        silhouette = silhouette_score(X, y_pred)
        rand_index = adjusted_rand_score(y_true, y_pred)
    else:
        silhouette = "N/A (Only one cluster found)"
        rand_index = "N/A (Only one cluster found)"
    return n_clusters, silhouette, rand_index
# Function to run Hierarchical clustering
def run_hierarchical(X, y_true):
    """Cluster X with agglomerative clustering (2 clusters) and score it.

    Returns:
        tuple: (n_clusters, silhouette score, adjusted Rand index vs.
        y_true). Both metrics degrade to an "N/A" string when fewer than
        two distinct labels are produced, since both are undefined then.
    """
    hierarchical = AgglomerativeClustering(n_clusters=2)
    y_pred = hierarchical.fit_predict(X)
    n_clusters = hierarchical.n_clusters
    # Guard both metrics together: silhouette_score raises for a single
    # label, so computing it before the check could crash.
    if len(set(y_pred)) > 1:
        silhouette = silhouette_score(X, y_pred)
        rand_index = adjusted_rand_score(y_true, y_pred)
    else:
        silhouette = "N/A (Only one cluster found)"
        rand_index = "N/A (Only one cluster found)"
    return n_clusters, silhouette, rand_index
# Function to run DBSCAN clustering
def run_dbscan(X, y_true):
    """Cluster X with DBSCAN (eps=1.0, min_samples=6) and score it.

    Returns a (cluster count, silhouette, adjusted Rand index) tuple;
    the two scores fall back to an "N/A" string when DBSCAN finds at
    most one cluster, because both metrics are undefined in that case.
    """
    labels = DBSCAN(eps=1.0, min_samples=6).fit_predict(X)
    distinct = set(labels)
    # Label -1 marks noise points and does not count as a cluster.
    n_clusters = len(distinct) - (1 if -1 in distinct else 0)
    if n_clusters > 1:
        silhouette = silhouette_score(X, labels)
        rand_index = adjusted_rand_score(y_true, labels)
    else:
        silhouette = "N/A (Only one cluster found)"
        rand_index = "N/A (Only one cluster found)"
    return n_clusters, silhouette, rand_index
# Function to run Gaussian Mixture clustering
def run_gaussian_mixture(X, y_true):
    """Cluster X with a 2-component Gaussian mixture and score it.

    Returns:
        tuple: (n_components, silhouette score, adjusted Rand index vs.
        y_true). Both metrics degrade to an "N/A" string when the model
        assigns every point to one component, since a mixture can
        collapse to a single label and both metrics are undefined then.
    """
    gaussian_mixture = GaussianMixture(n_components=2, random_state=42)
    y_pred = gaussian_mixture.fit_predict(X)
    n_clusters = gaussian_mixture.n_components
    # Guard both metrics: silhouette_score raises for a single label,
    # so it must not run unconditionally before the check.
    if len(set(y_pred)) > 1:
        silhouette = silhouette_score(X, y_pred)
        rand_index = adjusted_rand_score(y_true, y_pred)
    else:
        silhouette = "N/A (Only one cluster found)"
        rand_index = "N/A (Only one cluster found)"
    return n_clusters, silhouette, rand_index
# Main Streamlit app
def main():
    """Drive the full workflow: load and clean the data, visualize it,
    tune/compare clustering models, and serve a cluster-prediction form.
    """
    st.title("Customer Segmentation Analysis App")
    with st.expander("About this App"):
        st.markdown("""
## About this App
This app is designed to analyze customer data from a marketing campaign and determine customer segmentation using k-means clustering and other machine learning models.
### Dataset Information
- **Number of Records:** 2,240
- **Number of Features:** 29
### Features:
- **ID:** Unique identifier for each customer.
- **Year_Birth:** Year of birth of the customer.
- **Education:** Education level of the customer.
- **Marital_Status:** Marital status of the customer.
- **Income:** Annual income of the customer.
- **Kidhome:** Number of small children in customer's household.
- **Teenhome:** Number of teenagers in customer's household.
- **Dt_Customer:** Date of customer's enrollment with the company.
- **Recency:** Number of days since the customer's last purchase.
- **MntWines:** Amount spent on wine in the last 2 years.
- **MntFruits:** Amount spent on fruits in the last 2 years.
- **MntMeatProducts:** Amount spent on meat products in the last 2 years.
- **MntFishProducts:** Amount spent on fish products in the last 2 years.
- **MntSweetProducts:** Amount spent on sweet products in the last 2 years.
- **MntGoldProds:** Amount spent on gold products in the last 2 years.
- **NumDealsPurchases:** Number of purchases made with a discount.
- **NumWebPurchases:** Number of purchases made through the company's website.
- **NumCatalogPurchases:** Number of purchases made using a catalogue.
- **NumStorePurchases:** Number of purchases made directly in stores.
- **NumWebVisitsMonth:** Number of visits to company's website in the last month.
- **AcceptedCmp3, AcceptedCmp4, AcceptedCmp5, AcceptedCmp1, AcceptedCmp2:** 1 if customer accepted the offer in the respective campaign, 0 otherwise.
- **Complain:** 1 if customer complained in the last 2 years, 0 otherwise.
- **Z_CostContact, Z_Revenue:** Internal features related to cost and revenue.
- **Response:** 1 if customer accepted the offer in the last campaign, 0 otherwise.
### Analysis
The app will use k-means clustering and other machine learning models to determine customer segmentation based on their purchasing behavior and demographic information. This will help in identifying distinct groups of customers and tailoring marketing strategies accordingly.
Created by: Louie F. Cervantes, M.Eng. (Information Engineering)
""")
    # Load data
    df = load_data()
    # Data cleaning and validation
    #df = handle_mixed_types(df)
    df = handle_nulls(df)
    df = check_data_types(df)
    df = handle_mixed_types(df)
    # Remove outliers based on income
    df = remove_outliers(df)
    # Visualize data
    visualize_data(df)
    # Preprocess data into the reduced feature matrix + ground truth.
    X_contributing, y_true = preprocess_data_with_pca_exclusion(df)
    st.write("Optimizing Clustering...")
    st.write("Columns contributing to clustering:")
    st.write(X_contributing.columns.tolist())
    st.write("\nFirst few rows of the preprocessed data:")
    st.write(X_contributing.head())
    # Evaluate the optimal number of clusters using silhouette score
    silhouette_scores = []
    for n_clusters in range(2, 11):
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        y_pred = kmeans.fit_predict(X_contributing)
        silhouette_scores.append(silhouette_score(X_contributing, y_pred))
    optimal_n_clusters = range(2, 11)[silhouette_scores.index(max(silhouette_scores))]
    optimal_silhouette_score = max(silhouette_scores)
    st.write(f"Optimal number of clusters: {optimal_n_clusters}")
    st.write(f"Optimal silhouette score: {optimal_silhouette_score}")
    # Evaluate the explained variance ratio for PCA components
    explained_variance_ratio = PCA(n_components=0.95).fit(X_contributing).explained_variance_ratio_
    st.write(f"Explained variance ratio for PCA components: {explained_variance_ratio}")
    # Evaluate the scaling method (MinMaxScaler vs StandardScaler)
    scalers = [MinMaxScaler(), StandardScaler()]
    scaler_names = ['MinMaxScaler', 'StandardScaler']
    scaler_scores = []
    for scaler, name in zip(scalers, scaler_names):
        X_scaled = scaler.fit_transform(X_contributing)
        kmeans = KMeans(n_clusters=optimal_n_clusters, random_state=42)
        y_pred = kmeans.fit_predict(X_scaled)
        score = silhouette_score(X_scaled, y_pred)
        scaler_scores.append((name, score))
    # NOTE(review): after this loop `scaler` is whichever ran LAST
    # (StandardScaler) and `kmeans` is fitted on that scaler's output --
    # not necessarily the best-scoring combination. The prediction form
    # at the bottom reuses these leftover bindings; confirm intended.
    best_scaler_name, best_scaler_score = max(scaler_scores, key=lambda x: x[1])
    st.write(f"Best scaling method: {best_scaler_name}")
    st.write(f"Best silhouette score with scaling: {best_scaler_score}")
    # Define the parameter grid for DBSCAN
    param_grid = {
        'eps': np.arange(0.1, 1.1, 0.1),
        'min_samples': np.arange(2, 11, 1)
    }
    # Initialize DBSCAN model
    dbscan_model = DBSCAN()
    # Perform grid search with silhouette score as the evaluation metric
    best_score = -1
    best_params = None
    for eps in param_grid['eps']:
        for min_samples in param_grid['min_samples']:
            dbscan_model.set_params(eps=eps, min_samples=min_samples)
            y_pred = dbscan_model.fit_predict(X_contributing)
            # Label -1 is noise; it does not count as a cluster.
            n_clusters = len(set(y_pred)) - (1 if -1 in y_pred else 0)
            if n_clusters > 1:
                silhouette = silhouette_score(X_contributing, y_pred)
                if silhouette > best_score:
                    best_score = silhouette
                    best_params = {'eps': eps, 'min_samples': min_samples}
    st.write(f"Best parameters: {best_params}")
    st.write(f"Best silhouette score: {best_score}")
    # Run DBSCAN with the best parameters
    # NOTE(review): if no (eps, min_samples) pair ever produced more than
    # one cluster, best_params is still None and set_params(**best_params)
    # raises TypeError -- consider guarding before unpacking.
    dbscan_model.set_params(**best_params)
    y_pred_best = dbscan_model.fit_predict(X_contributing)
    n_clusters_best = len(set(y_pred_best)) - (1 if -1 in y_pred_best else 0)
    if n_clusters_best > 1:
        silhouette_best = silhouette_score(X_contributing, y_pred_best)
        rand_index_best = adjusted_rand_score(y_true, y_pred_best)
    else:
        silhouette_best = "N/A (Only one cluster found)"
        rand_index_best = "N/A (Only one cluster found)"
    st.write(f"Number of Clusters: {n_clusters_best}")
    st.write(f"Silhouette Score: {silhouette_best}")
    st.write(f"Rand Index: {rand_index_best}")
    # Create tabs, one per clustering algorithm comparison.
    tab1, tab2, tab3, tab4 = st.tabs(["K-Means", "Hierarchical", "DBSCAN", "Gaussian Mixture"])
    # Tab 1: K-Means
    with tab1:
        n_clusters, silhouette, rand_index = run_kmeans(X_contributing, y_true)
        st.write(f"Number of Clusters: {n_clusters}")
        st.write(f"Silhouette Score: {silhouette:.3f}")
        st.write(f"Rand Index: {rand_index}")
    # Tab 2: Hierarchical
    with tab2:
        n_clusters, silhouette, rand_index = run_hierarchical(X_contributing, y_true)
        st.write(f"Number of Clusters: {n_clusters}")
        st.write(f"Silhouette Score: {silhouette:.3f}")
        st.write(f"Rand Index: {rand_index}")
    # Tab 3: DBSCAN
    with tab3:
        n_clusters, silhouette, rand_index = run_dbscan(X_contributing, y_true)
        st.write(f"Number of Clusters: {n_clusters}")
        st.write(f"Silhouette Score: {silhouette}")
        st.write(f"Rand Index: {rand_index}")
    # Tab 4: Gaussian Mixture
    with tab4:
        n_clusters, silhouette, rand_index = run_gaussian_mixture(X_contributing, y_true)
        st.write(f"Number of Clusters: {n_clusters}")
        st.write(f"Silhouette Score: {silhouette:.3f}")
        st.write(f"Rand Index: {rand_index}")
    st.header("Predict Customer Cluster")
    # Create a form for user input
    with st.form(key='customer_form'):
        year_birth = st.number_input('Year of Birth', min_value=1900, max_value=2023, value=1980)
        education = st.selectbox('Education Level', df['Education'].unique())
        marital_status = st.selectbox('Marital Status', df['Marital_Status'].unique())
        income = st.number_input('Annual Income', min_value=0, value=50000)
        kidhome = st.number_input('Number of Small Children', min_value=0, max_value=10, value=0)
        teenhome = st.number_input('Number of Teenagers', min_value=0, max_value=10, value=0)
        recency = st.number_input('Recency (days since last purchase)', min_value=0, value=30)
        mnt_wines = st.number_input('Amount Spent on Wine', min_value=0, value=100)
        mnt_fruits = st.number_input('Amount Spent on Fruits', min_value=0, value=50)
        mnt_meat_products = st.number_input('Amount Spent on Meat Products', min_value=0, value=200)
        mnt_fish_products = st.number_input('Amount Spent on Fish Products', min_value=0, value=50)
        mnt_sweet_products = st.number_input('Amount Spent on Sweet Products', min_value=0, value=50)
        mnt_gold_prods = st.number_input('Amount Spent on Gold Products', min_value=0, value=100)
        num_deals_purchases = st.number_input('Number of Purchases with Discount', min_value=0, value=5)
        num_web_purchases = st.number_input('Number of Web Purchases', min_value=0, value=5)
        num_catalog_purchases = st.number_input('Number of Catalog Purchases', min_value=0, value=5)
        num_store_purchases = st.number_input('Number of Store Purchases', min_value=0, value=5)
        num_web_visits_month = st.number_input('Number of Web Visits per Month', min_value=0, value=5)
        submit_button = st.form_submit_button(label='Predict Cluster')
    # Predict cluster when form is submitted
    if submit_button:
        # Create a dataframe from user input
        user_data = pd.DataFrame({
            'Year_Birth': [year_birth],
            'Education': [education],
            'Marital_Status': [marital_status],
            'Income': [income],
            'Kidhome': [kidhome],
            'Teenhome': [teenhome],
            'Recency': [recency],
            'MntWines': [mnt_wines],
            'MntFruits': [mnt_fruits],
            'MntMeatProducts': [mnt_meat_products],
            'MntFishProducts': [mnt_fish_products],
            'MntSweetProducts': [mnt_sweet_products],
            'MntGoldProds': [mnt_gold_prods],
            'NumDealsPurchases': [num_deals_purchases],
            'NumWebPurchases': [num_web_purchases],
            'NumCatalogPurchases': [num_catalog_purchases],
            'NumStorePurchases': [num_store_purchases],
            'NumWebVisitsMonth': [num_web_visits_month]
        })
        # One-hot encode user input
        user_data_encoded = pd.get_dummies(user_data, columns=['Education', 'Marital_Status'])
        # Align the encoded user data with the training data
        user_data_encoded = user_data_encoded.reindex(columns=X_contributing.columns, fill_value=0)
        # NOTE(review): `scaler` and `kmeans` here are the leftover
        # bindings from the scaler-comparison loop above (StandardScaler
        # and its k-means fit), not a model trained specifically for this
        # form -- verify this is the intended pipeline.
        user_data_scaled = scaler.transform(user_data_encoded)
        # Predict the cluster
        cluster = kmeans.predict(user_data_scaled)
        st.write(f'The predicted customer cluster is: {cluster[0]}')
    st.markdown("""(c) 2025 West Visayas State University - College of Information and Communications Technology""")
# Script entry point: launch the Streamlit app.
if __name__ == "__main__":
    main()