|
|
import streamlit as st |
|
|
import pandas as pd |
|
|
from sklearn.cluster import KMeans |
|
|
from sklearn.preprocessing import StandardScaler |
|
|
import seaborn as sns |
|
|
import matplotlib.pyplot as plt |
|
|
from datasets import load_dataset |
|
|
|
|
|
|
|
|
# Must be the first Streamlit call in the script; wide layout suits the side-by-side plots.
st.set_page_config(page_title="Unsupervised ML: Mall Customer Segmentation", layout="wide")
|
|
|
|
|
|
|
|
@st.cache_data
def load_data():
    """Fetch the mall-customers dataset from the Hugging Face Hub.

    Returns
    -------
    pd.DataFrame
        The full ``train`` split as a DataFrame. Cached by Streamlit so the
        download happens only once per session.
    """
    raw = load_dataset("kheejay88/mall_customers", split="train")
    return pd.DataFrame(raw)
|
|
|
|
|
# Loaded once; subsequent reruns hit the st.cache_data cache.
df = load_data()
|
|
|
|
|
|
|
|
@st.cache_data
def preprocess_data(df):
    """Standardise the two features used for clustering.

    Returns
    -------
    tuple
        ``(X, X_scaled, features, scaler)`` — the raw feature frame, the
        z-scored array, the feature column names, and the fitted scaler
        (kept so user input can be transformed identically later).
    """
    cols = ['Annual Income (k$)', 'Spending Score (1-100)']
    raw = df[cols]
    sc = StandardScaler()
    scaled = sc.fit_transform(raw)
    return raw, scaled, cols, sc
|
|
|
|
|
# Fit the scaler on the whole dataset; `scaler` is reused to transform user input.
X, X_scaled, features, scaler = preprocess_data(df)
|
|
|
|
|
|
|
|
@st.cache_data
def perform_clustering(X_scaled, n_clusters=5):
    """Fit K-Means on the scaled features and assign a cluster to each row.

    Parameters
    ----------
    X_scaled : array-like
        Standardised feature matrix.
    n_clusters : int, optional
        Number of segments to form (default 5).

    Returns
    -------
    tuple
        ``(kmeans, clusters)`` — the fitted estimator and the per-row labels.
    """
    # n_init is pinned to 10 (the historical scikit-learn default): newer
    # releases default to 'auto' (a single init for k-means++), which both
    # emits FutureWarnings in transition versions and can change the fitted
    # centres. Pinning keeps results reproducible across sklearn versions.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    clusters = kmeans.fit_predict(X_scaled)
    return kmeans, clusters
|
|
|
|
|
# Run the clustering once (cached) and attach results to the DataFrame.
kmeans, clusters = perform_clustering(X_scaled)

# Numeric assignment plus a human-readable label column for plots/tables.
df['Cluster'] = clusters
cluster_labels = {c: f'Cluster {c}' for c in range(kmeans.n_clusters)}
df['Cluster Label'] = [cluster_labels[c] for c in clusters]
|
|
|
|
|
|
|
|
st.sidebar.header("User Input Features")


def _feature_slider(column):
    # One slider per feature: spans the observed min/max, defaults to the mean.
    series = df[column]
    return st.sidebar.slider(
        column,
        int(series.min()),
        int(series.max()),
        int(series.mean()),
    )


annual_income = _feature_slider('Annual Income (k$)')
spending_score = _feature_slider('Spending Score (1-100)')
|
|
|
|
|
|
|
|
def predict_cluster(annual_income, spending_score, kmeans, scaler):
    """Classify a single (income, score) pair against the fitted model.

    Parameters
    ----------
    annual_income, spending_score : number
        Raw (unscaled) user inputs.
    kmeans : fitted KMeans estimator
    scaler : fitted StandardScaler

    Returns
    -------
    tuple
        ``(cluster, distances)`` — the predicted cluster id and the distance
        from the scaled point to every cluster centre.
    """
    # Wrap in a DataFrame with the training column names so the scaler
    # receives the same feature layout it was fitted on.
    point = pd.DataFrame([[annual_income, spending_score]], columns=features)
    point_scaled = scaler.transform(point)
    predicted = kmeans.predict(point_scaled)[0]
    centre_distances = kmeans.transform(point_scaled)[0]
    return predicted, centre_distances
|
|
|
|
|
|
|
|
# Recomputed on every rerun as the sidebar sliders move.
user_cluster, distances = predict_cluster(annual_income, spending_score, kmeans, scaler)
|
|
|
|
|
|
|
|
# NOTE(review): the leading characters in these strings look like mojibake of
# emoji (UTF-8 decoded with a wrong codec) — confirm the intended glyphs and
# re-save the file as UTF-8. Left byte-for-byte untouched here.
st.title("ποΈ Mall Customer Segmentation App")

# Three views: explanation, dataset-wide visuals, and per-user prediction.
tab1, tab2, tab3 = st.tabs(["π About", "π Data Visualization", "π Predict Cluster"])
|
|
|
|
|
|
|
|
with tab1:
    st.header("About This App")
    # Static explanatory copy rendered as markdown; kept flush-left so no
    # line gains 4+ leading spaces (which markdown would treat as code).
    st.markdown("""
## Overview
This app uses **K-Means clustering** to segment mall customers based on their annual income and spending score.

## How It Works
1. **Data Preprocessing**:
    - Data is scaled using `StandardScaler` to ensure even distribution.

2. **Clustering**:
    - The K-Means algorithm groups customers into 5 clusters.

3. **Prediction**:
    - Users can provide input values and the app will predict which cluster they belong to.

## Dataset Information
""")
    # Preview the first rows so readers see the raw schema.
    st.dataframe(df.head())
    st.markdown("""
The dataset contains **200 samples** of customer data with the following features:
- **Annual Income (k$)**
- **Spending Score (1-100)**
- **Customer ID, Gender, Age** (used for reference)
""")
|
|
|
|
|
|
|
|
with tab2:
    st.header("Data Visualization")

    # Scatter of the two clustering features, coloured by assigned segment.
    st.subheader("Cluster Distribution")
    fig, ax = plt.subplots()
    sns.scatterplot(
        x=df['Annual Income (k$)'],
        y=df['Spending Score (1-100)'],
        hue=df['Cluster Label'],
        palette='viridis',
        s=100,
        alpha=0.7,
        ax=ax,
    )
    # Explicit axes API instead of plt.* so labels target this figure even if
    # pyplot's "current axes" ever points elsewhere.
    ax.set_xlabel('Annual Income (k$)')
    ax.set_ylabel('Spending Score (1-100)')
    ax.set_title('Customer Segments')
    st.pyplot(fig)

    # Centres live in scaled space; invert the scaling so the table reads in
    # the original units (k$ / score points).
    st.subheader("Cluster Centers")
    cluster_centers = pd.DataFrame(
        scaler.inverse_transform(kmeans.cluster_centers_),
        columns=features,
    )
    # list(...): assigning a raw dict_values view to a DataFrame column is
    # fragile (numpy can coerce the view to a single 0-d object) — always
    # materialise it first.
    cluster_centers['Cluster'] = list(cluster_labels.values())
    st.dataframe(cluster_centers)
|
|
|
|
|
|
|
|
with tab3:
    st.header("Predict Cluster for Custom Input")

    # Echo the sidebar inputs so the prediction context is visible in-tab.
    st.subheader("User Input:")
    st.write(f"**Annual Income (k$):** {annual_income}")
    st.write(f"**Spending Score (1-100):** {spending_score}")

    st.subheader("Predicted Cluster:")
    st.write(f"Your input corresponds to **{cluster_labels[user_cluster]}**.")

    if st.checkbox("Show Cluster Distances"):
        st.write("**Distance to Each Cluster:**")

        # Build the name list once and reuse it for the table and the plot.
        cluster_names = [f'Cluster {i}' for i in range(len(distances))]
        distance_df = pd.DataFrame(distances, index=cluster_names, columns=["Distance"])
        st.dataframe(distance_df)

        st.subheader("Distance to Each Cluster (Graph)")

        fig, ax = plt.subplots()
        # seaborn >= 0.13 deprecates `palette` without `hue`; pairing hue
        # with the x values (and hiding the redundant legend) keeps the
        # one-colour-per-bar look without the FutureWarning.
        sns.barplot(
            x=cluster_names,
            y=distances,
            hue=cluster_names,
            palette='viridis',
            legend=False,
            ax=ax,
        )
        ax.set_ylabel("Distance")
        ax.set_title("Distance to Each Cluster")
        st.pyplot(fig)

        st.markdown("""
**How to Interpret:**
- A **lower distance** means the input is closer to that cluster's center.
- The predicted cluster will have the smallest distance.
""")
|
|
|
|
|
|
|
|
# Footer: horizontal rule plus author credit.
st.markdown("---")
st.write("**By: kheejay**")