"""Streamlit app that clusters rows of an uploaded CSV with K-Means.

Workflow: upload a CSV, drop incomplete rows, pick feature columns, one-hot
encode object columns, standardize, show an elbow plot, cluster with a
user-selected k, and visualize the clusters on the first two principal
components.
"""

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA

# Upper bound on the number of clusters offered to the user.
MAX_CLUSTERS = 10
# Fixed K-Means settings so results are reproducible across reruns.
KMEANS_RANDOM_STATE = 42
KMEANS_N_INIT = 10


def _encode_features(data):
    """Return *data* with object-dtype columns replaced by one-hot columns.

    If *data* has no object-dtype columns it is returned unchanged. Otherwise
    the result holds the numeric columns (index reset) followed by the
    one-hot encoded object columns; the first level of each object column is
    dropped (``drop='first'``).
    """
    categorical_cols = data.select_dtypes(include=['object']).columns.tolist()
    if not categorical_cols:
        return data

    numerical_cols = data.select_dtypes(include=['number']).columns.tolist()
    encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
    encoded_cats = encoder.fit_transform(data[categorical_cols])
    encoded_cats_df = pd.DataFrame(
        encoded_cats,
        columns=encoder.get_feature_names_out(categorical_cols),
    )
    # The encoder output carries a fresh 0..n-1 index, so the numeric part
    # must be reset to the same index before concatenating column-wise.
    return pd.concat(
        [data[numerical_cols].reset_index(drop=True), encoded_cats_df],
        axis=1,
    )


def _make_kmeans(n_clusters):
    """Build a K-Means estimator with the app's fixed seed and n_init."""
    return KMeans(
        n_clusters=n_clusters,
        random_state=KMEANS_RANDOM_STATE,
        n_init=KMEANS_N_INIT,
    )


def _show_elbow_plot(scaled_data, max_k):
    """Fit K-Means for k = 1..max_k and plot inertia against k."""
    k_range = range(1, max_k + 1)
    distortions = [_make_kmeans(k).fit(scaled_data).inertia_ for k in k_range]

    fig, ax = plt.subplots()
    ax.plot(k_range, distortions, marker='o')
    ax.set_xlabel('Number of Clusters')
    ax.set_ylabel('Distortion')
    ax.set_title('Elbow Method')
    st.pyplot(fig)
    # Streamlit reruns the script on every interaction; close the figure so
    # pyplot does not keep accumulating open figures.
    plt.close(fig)


def _show_pca_scatter(df, scaled_data):
    """Project *scaled_data* to 2-D with PCA and plot it coloured by cluster.

    Adds ``PCA1`` and ``PCA2`` columns to *df*, which must already contain a
    ``Cluster`` column.
    """
    pca_result = PCA(n_components=2).fit_transform(scaled_data)
    df['PCA1'] = pca_result[:, 0]
    df['PCA2'] = pca_result[:, 1]

    fig, ax = plt.subplots()
    sns.scatterplot(x='PCA1', y='PCA2', hue='Cluster', palette='viridis', data=df, ax=ax)
    ax.set_title("Customer Segmentation (PCA Visualization)")
    st.pyplot(fig)
    plt.close(fig)


def main():
    """Render the app; returns early whenever a step cannot proceed."""
    st.title("Customer Segmentation App")

    dataset_file = st.file_uploader("Upload your CSV file", type=["csv"])
    if dataset_file is None:
        return

    try:
        df = pd.read_csv(dataset_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        st.error(f"Could not read the uploaded CSV file: {exc}")
        return

    st.write("### Preview of Uploaded Data:")
    st.dataframe(df.head())

    # Drop rows with missing values in the entire dataset.
    df.dropna(inplace=True)
    if df.empty:
        st.error("No rows remain after dropping rows with missing values.")
        return

    st.write("### Select Features for Clustering")
    selected_features = st.multiselect(
        "Choose features", df.columns.tolist(), default=df.columns.tolist()
    )
    if not selected_features:
        return

    features = _encode_features(df[selected_features])
    if features.shape[1] == 0:
        # Happens e.g. when the only selected column is categorical with a
        # single level: drop='first' leaves nothing to cluster on.
        st.error("The selected features produce no usable columns after encoding.")
        return

    scaled_data = StandardScaler().fit_transform(features)

    # Ensure no NaN values exist after transformations.
    if np.isnan(scaled_data).any():
        st.error("Data contains NaN values even after preprocessing. Please check your dataset.")
        return

    n_samples, n_features = scaled_data.shape
    if n_samples < 2:
        st.error("At least 2 complete rows are required for clustering.")
        return
    # K-Means cannot form more clusters than there are samples.
    max_k = min(MAX_CLUSTERS, n_samples)

    st.write("### Elbow Method for Optimal K")
    _show_elbow_plot(scaled_data, max_k)

    if max_k > 2:
        k = st.slider("Select Number of Clusters", min_value=2, max_value=max_k, value=3)
    else:
        # A slider needs a non-degenerate range; with two rows k can only be 2.
        k = 2
        st.info("Only 2 rows are available, so 2 clusters are used.")

    df['Cluster'] = _make_kmeans(k).fit_predict(scaled_data)
    st.write("### Clustered Data")
    st.dataframe(df.head())

    st.write("### Cluster Visualization (PCA)")
    if n_features < 2:
        # PCA with two components needs at least two input columns.
        st.info("PCA visualization requires at least two feature columns after encoding.")
        return
    _show_pca_scatter(df, scaled_data)


# Streamlit executes this script top to bottom on every rerun.
main()