# mherlie's picture
# modified code and added dataset
# dd9dd07
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
# Customer Segmentation App (Streamlit script).
#
# Flow: upload a CSV -> drop incomplete rows -> pick feature columns ->
# one-hot encode object columns and standardize -> show an elbow plot of
# KMeans inertia -> cluster with the chosen k -> show a 2-D PCA scatter plot.
# Streamlit re-executes this script top to bottom on every widget interaction.

# Largest cluster count offered by the elbow scan and the slider.
MAX_CLUSTERS = 10

st.title("Customer Segmentation App")

# File uploader
dataset_file = st.file_uploader("Upload your CSV file", type=["csv"])

if dataset_file is not None:
    df = pd.read_csv(dataset_file)
    st.write("### Preview of Uploaded Data:")
    st.dataframe(df.head())

    # Drop rows with missing values in the entire dataset
    df.dropna(inplace=True)

    # Select features for clustering
    st.write("### Select Features for Clustering")
    selected_features = st.multiselect(
        "Choose features", df.columns.tolist(), default=df.columns.tolist()
    )

    if len(df) < 2:
        # Encoding, scaling and KMeans all need rows to work with; fail with a
        # readable message instead of a library traceback.
        st.error(
            "Fewer than two complete rows remain after dropping missing values. "
            "Clustering needs at least two rows."
        )
    elif selected_features:
        data = df[selected_features]

        # Identify categorical and numerical features
        categorical_cols = data.select_dtypes(include=['object']).columns.tolist()
        numerical_cols = data.select_dtypes(include=['number']).columns.tolist()

        # Encode categorical features
        if categorical_cols:
            encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
            encoded_cats = encoder.fit_transform(data[categorical_cols])
            encoded_cats_df = pd.DataFrame(
                encoded_cats, columns=encoder.get_feature_names_out(categorical_cols)
            )
            # The encoded frame has a fresh 0..n-1 index, so the numeric part is
            # re-indexed the same way to make the column-wise concat line up.
            data = pd.concat(
                [data[numerical_cols].reset_index(drop=True), encoded_cats_df], axis=1
            )

        if data.shape[1] == 0:
            # drop='first' removes one category per column, so a selection made
            # only of single-valued categorical columns encodes to nothing.
            st.error(
                "No usable feature columns remain after encoding. "
                "Please select different features."
            )
        else:
            # Standardize data
            scaler = StandardScaler()
            scaled_data = scaler.fit_transform(data)

            # Ensure no NaN values exist after transformations
            if np.isnan(scaled_data).any():
                st.error("Data contains NaN values even after preprocessing. Please check your dataset.")
            else:
                n_samples, n_features = scaled_data.shape
                # KMeans cannot form more clusters than there are rows.
                max_k = min(MAX_CLUSTERS, n_samples)

                # Determine number of clusters using Elbow Method
                st.write("### Elbow Method for Optimal K")
                K_range = range(1, max_k + 1)
                distortions = [
                    KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
                    .fit(scaled_data)
                    .inertia_
                    for n_clusters in K_range
                ]

                fig, ax = plt.subplots()
                ax.plot(K_range, distortions, marker='o')
                ax.set_xlabel('Number of Clusters')
                ax.set_ylabel('Distortion')
                ax.set_title('Elbow Method')
                st.pyplot(fig)
                # pyplot keeps every figure alive until closed; without this
                # each script re-run would leave another figure behind.
                plt.close(fig)

                # Choose number of clusters
                if max_k > 2:
                    k = st.slider(
                        "Select Number of Clusters", min_value=2, max_value=max_k, value=3
                    )
                else:
                    # Exactly two rows: there is no range to choose from.
                    k = 2
                    st.info("Only two rows are available, so 2 clusters are used.")

                # Apply K-Means
                kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
                # scaled_data rows are in the same order as df rows (both are
                # post-dropna), so the positional array assignment lines up.
                df['Cluster'] = kmeans.fit_predict(scaled_data)
                st.write("### Clustered Data")
                st.dataframe(df.head())

                # PCA for visualization
                st.write("### Cluster Visualization (PCA)")
                if n_features < 2:
                    # A 2-component projection is impossible with one feature.
                    st.info(
                        "The PCA plot needs at least two feature columns after encoding."
                    )
                else:
                    pca = PCA(n_components=2)
                    pca_result = pca.fit_transform(scaled_data)
                    df['PCA1'] = pca_result[:, 0]
                    df['PCA2'] = pca_result[:, 1]

                    # Scatter plot of clusters
                    fig, ax = plt.subplots()
                    sns.scatterplot(
                        x='PCA1', y='PCA2', hue='Cluster', palette='viridis', data=df, ax=ax
                    )
                    ax.set_title("Customer Segmentation (PCA Visualization)")
                    st.pyplot(fig)
                    plt.close(fig)