Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| from sklearn.cluster import KMeans | |
| from sklearn.preprocessing import StandardScaler, OneHotEncoder | |
| from sklearn.decomposition import PCA | |
# Customer segmentation app: upload a CSV, choose feature columns, cluster the
# rows with K-Means and plot the clusters on the first two principal components.

MAX_CLUSTERS = 10  # upper bound for both the elbow plot and the cluster slider
MIN_ROWS = 3       # the slider defaults to 3 clusters, which needs >= 3 rows


def _load_csv(uploaded_file):
    """Parse the uploaded CSV into a DataFrame.

    An empty or malformed file is reported in the app with ``st.error``
    instead of raising; ``None`` is returned in that case.
    """
    try:
        return pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        st.error(f"Could not read the uploaded file as CSV: {exc}")
        return None


def _encode_features(data):
    """Return an all-numeric frame built from ``data``.

    Numeric columns are kept as they are. Every other column (strings,
    booleans, ...) is one-hot encoded with the first category dropped, so a
    selected non-numeric column is never silently discarded. The result has
    a fresh 0..n-1 index so both parts line up row by row.
    """
    numerical_cols = data.select_dtypes(include=['number']).columns.tolist()
    categorical_cols = data.select_dtypes(exclude=['number']).columns.tolist()
    numeric_part = data[numerical_cols].reset_index(drop=True)
    if not categorical_cols:
        return numeric_part

    encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
    # Cast to str so a column mixing value types (e.g. text and numbers)
    # can still be encoded.
    encoded_cats = encoder.fit_transform(data[categorical_cols].astype(str))
    encoded_cats_df = pd.DataFrame(
        encoded_cats,
        columns=encoder.get_feature_names_out(categorical_cols),
    )
    return pd.concat([numeric_part, encoded_cats_df], axis=1)


def _elbow_inertias(scaled_data, k_values):
    """Fit K-Means once for each k in ``k_values`` and return the inertias."""
    return [
        KMeans(n_clusters=k, random_state=42, n_init=10).fit(scaled_data).inertia_
        for k in k_values
    ]


def _show_figure(fig):
    """Render ``fig`` in the app, then close it.

    Closing releases the figure from pyplot's registry; otherwise one more
    figure stays open on every script rerun.
    """
    st.pyplot(fig)
    plt.close(fig)


def _segment_customers(df, selected_features):
    """Cluster ``df`` on ``selected_features`` and render the results.

    ``df`` is expected to contain no missing values. It is modified in place:
    a 'Cluster' column is added, and 'PCA1'/'PCA2' columns when the PCA
    scatter plot can be drawn. Problems with the input are reported in the
    app and end the function early instead of raising.
    """
    n_samples = len(df)
    if n_samples < MIN_ROWS:
        st.error(
            f"At least {MIN_ROWS} rows without missing values are required for clustering."
        )
        return

    data = _encode_features(df[selected_features])
    n_features = data.shape[1]
    if n_features == 0:
        # Happens when every selected column is categorical with one category.
        st.error(
            "The selected features produce no columns to cluster on. Choose at least "
            "one numeric column or a categorical column with more than one distinct value."
        )
        return

    # Standardize data
    scaled_data = StandardScaler().fit_transform(data)

    # Ensure no NaN values exist after transformations
    if np.isnan(scaled_data).any():
        st.error("Data contains NaN values even after preprocessing. Please check your dataset.")
        return

    # K-Means cannot fit more clusters than there are rows.
    max_k = min(MAX_CLUSTERS, n_samples)

    # Determine number of clusters using Elbow Method
    st.write("### Elbow Method for Optimal K")
    k_values = range(1, max_k + 1)
    distortions = _elbow_inertias(scaled_data, k_values)
    fig, ax = plt.subplots()
    ax.plot(k_values, distortions, marker='o')
    ax.set_xlabel('Number of Clusters')
    ax.set_ylabel('Distortion')
    ax.set_title('Elbow Method')
    _show_figure(fig)

    # Choose number of clusters (max_k >= MIN_ROWS, so the default of 3 is valid)
    k = st.slider("Select Number of Clusters", min_value=2, max_value=max_k, value=3)

    # Apply K-Means
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    df['Cluster'] = kmeans.fit_predict(scaled_data)
    st.write("### Clustered Data")
    st.dataframe(df.head())

    # PCA for visualization
    st.write("### Cluster Visualization (PCA)")
    if n_features < 2:
        # Two components cannot be extracted from a single feature column.
        st.info("The PCA plot needs at least two feature columns after encoding.")
        return
    pca_result = PCA(n_components=2).fit_transform(scaled_data)
    df['PCA1'] = pca_result[:, 0]
    df['PCA2'] = pca_result[:, 1]

    # Scatter plot of clusters
    fig, ax = plt.subplots()
    sns.scatterplot(x='PCA1', y='PCA2', hue='Cluster', palette='viridis', data=df, ax=ax)
    ax.set_title("Customer Segmentation (PCA Visualization)")
    _show_figure(fig)


st.title("Customer Segmentation App")

# File uploader
dataset_file = st.file_uploader("Upload your CSV file", type=["csv"])

if dataset_file is not None:
    df = _load_csv(dataset_file)
    if df is not None:
        st.write("### Preview of Uploaded Data:")
        st.dataframe(df.head())

        # Drop rows with missing values in the entire dataset
        df = df.dropna()

        # Select features for clustering
        st.write("### Select Features for Clustering")
        selected_features = st.multiselect(
            "Choose features",
            df.columns.tolist(),
            default=df.columns.tolist(),
        )
        if selected_features:
            _segment_customers(df, selected_features)