import streamlit as st
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
from datasets import load_dataset
# Configure the browser tab title and let the app use the full page width.
st.set_page_config(
    page_title="Unsupervised ML: Mall Customer Segmentation",
    layout="wide",
)
# --------------------- Load and preprocess the dataset ---------------------
@st.cache_data
def load_data():
    """Load the mall-customers dataset as a pandas DataFrame.

    Fetches the ``train`` split of ``kheejay88/mall_customers`` through
    ``datasets.load_dataset``. The result is cached by Streamlit so the
    dataset is not re-fetched on every script rerun.

    Returns:
        pandas.DataFrame: One row per customer.
    """
    dataset = load_dataset("kheejay88/mall_customers", split="train")
    # Use the library's own converter rather than pd.DataFrame(dataset),
    # which iterates the dataset one example at a time.
    return dataset.to_pandas()


df = load_data()
# Preprocess data
@st.cache_data
def preprocess_data(df):
    """Pick the two clustering features and standardise them.

    Args:
        df: Customer table containing the 'Annual Income (k$)' and
            'Spending Score (1-100)' columns.

    Returns:
        tuple: ``(X, X_scaled, features, scaler)`` -- the selected columns,
        their standardised values, the list of column names, and the
        fitted ``StandardScaler``.
    """
    feature_columns = ['Annual Income (k$)', 'Spending Score (1-100)']
    selected = df[feature_columns]

    # Fit first, then transform with the same scaler instance so it can be
    # reused later on new inputs.
    fitted_scaler = StandardScaler().fit(selected)
    standardized = fitted_scaler.transform(selected)
    return selected, standardized, feature_columns, fitted_scaler


X, X_scaled, features, scaler = preprocess_data(df)
# --------------------- Perform K-Means clustering ---------------------
@st.cache_data
def perform_clustering(X_scaled, n_clusters=5):
    """Fit K-Means on the scaled features and label every row.

    Args:
        X_scaled: Standardised feature matrix to cluster.
        n_clusters: Number of clusters to fit. Defaults to 5.

    Returns:
        tuple: ``(kmeans, clusters)`` -- the fitted ``KMeans`` estimator and
        the per-row cluster indices returned by ``fit_predict``.
    """
    # n_init is pinned explicitly: its default differs between scikit-learn
    # releases, so leaving it out makes the clustering (and the warnings
    # emitted) depend on the installed version.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    clusters = kmeans.fit_predict(X_scaled)
    return kmeans, clusters


kmeans, clusters = perform_clustering(X_scaled)
# Map every cluster index to a display name, then attach both the raw index
# and the display name to the dataframe.
cluster_labels = dict(
    (cluster_id, f'Cluster {cluster_id}')
    for cluster_id in range(kmeans.n_clusters)
)
df['Cluster'] = clusters
df['Cluster Label'] = df['Cluster'].map(cluster_labels)
# --------------------- Sidebar for user input ---------------------
st.sidebar.header("User Input Features")

# Each slider spans the observed range of its column and starts at the
# column mean; bounds and default are truncated to whole numbers.
income_column = df['Annual Income (k$)']
annual_income = st.sidebar.slider(
    'Annual Income (k$)',
    min_value=int(income_column.min()),
    max_value=int(income_column.max()),
    value=int(income_column.mean()),
)

score_column = df['Spending Score (1-100)']
spending_score = st.sidebar.slider(
    'Spending Score (1-100)',
    min_value=int(score_column.min()),
    max_value=int(score_column.max()),
    value=int(score_column.mean()),
)
# --------------------- Predict cluster for user input ---------------------
def predict_cluster(annual_income, spending_score, kmeans, scaler):
    """Assign a single (income, score) pair to its nearest cluster.

    Args:
        annual_income: Annual income value chosen by the user.
        spending_score: Spending score value chosen by the user.
        kmeans: Fitted clustering model exposing ``predict`` and ``transform``.
        scaler: Fitted scaler exposing ``transform``.

    Returns:
        tuple: ``(cluster, distances)`` -- the predicted cluster index and
        the values ``kmeans.transform`` yields for the scaled input (one
        per cluster).
    """
    # Build a one-row frame with the same column names used for fitting.
    sample = pd.DataFrame(data=[(annual_income, spending_score)], columns=features)
    scaled_sample = scaler.transform(sample)

    # Exactly one row goes in, so exactly one result comes out of each call.
    (cluster,) = kmeans.predict(scaled_sample)
    (distances,) = kmeans.transform(scaled_sample)
    return cluster, distances


# Prediction for the values currently selected in the sidebar
user_cluster, distances = predict_cluster(annual_income, spending_score, kmeans, scaler)
# --------------------- Main panel ---------------------
# NOTE: the title and tab labels contain emoji; keep this file saved as UTF-8.
st.title("🛍️ Mall Customer Segmentation App")

# --------------------- Tabs ---------------------
tab1, tab2, tab3 = st.tabs(["📖 About", "📊 Data Visualization", "🔍 Predict Cluster"])
# --------------------- About Tab ---------------------
with tab1:
    st.header("About This App")

    # Static description of the app, shown above the data preview.
    overview_text = """
## Overview
This app uses **K-Means clustering** to segment mall customers based on their annual income and spending score.
## How It Works
1. **Data Preprocessing**:
- Data is scaled using `StandardScaler` to ensure even distribution.
2. **Clustering**:
- The K-Means algorithm groups customers into 5 clusters.
3. **Prediction**:
- Users can provide input values and the app will predict which cluster they belong to.
## Dataset Information
"""
    st.markdown(overview_text)

    # First rows of the customer table, as a quick preview.
    preview_rows = df.head()
    st.dataframe(preview_rows)

    # Static summary of the dataset columns, shown below the preview.
    dataset_text = """
The dataset contains **200 samples** of customer data with the following features:
- **Annual Income (k$)**
- **Spending Score (1-100)**
- **Customer ID, Gender, Age** (used for reference)
"""
    st.markdown(dataset_text)
# --------------------- Data Visualization Tab ---------------------
with tab2:
    st.header("Data Visualization")

    # Scatter plot of the two clustering features, coloured by cluster label.
    st.subheader("Cluster Distribution")
    fig, ax = plt.subplots()
    sns.scatterplot(
        x=df['Annual Income (k$)'],
        y=df['Spending Score (1-100)'],
        hue=df['Cluster Label'],
        palette='viridis',
        s=100,
        alpha=0.7,
        ax=ax,
    )
    # Label through the Axes object instead of pyplot's global "current
    # axes" state, so the labels always land on this figure.
    ax.set_xlabel('Annual Income (k$)')
    ax.set_ylabel('Spending Score (1-100)')
    ax.set_title('Customer Segments')
    st.pyplot(fig)
    # The script reruns on every interaction; close the figure so pyplot
    # does not keep accumulating open figures.
    plt.close(fig)

    # Cluster centres, converted back from scaled units to the original
    # feature units so they are readable.
    st.subheader("Cluster Centers")
    cluster_centers = pd.DataFrame(
        scaler.inverse_transform(kmeans.cluster_centers_),
        columns=features,
    )
    # Materialise the dict view into a list before assigning it as a column.
    cluster_centers['Cluster'] = list(cluster_labels.values())
    st.dataframe(cluster_centers)
# --------------------- Predict Cluster Tab ---------------------
with tab3:
    st.header("Predict Cluster for Custom Input")

    # Echo the values currently selected in the sidebar.
    st.subheader("User Input:")
    st.write(f"**Annual Income (k$):** {annual_income}")
    st.write(f"**Spending Score (1-100):** {spending_score}")

    # Display predicted cluster
    st.subheader("Predicted Cluster:")
    st.write(f"Your input corresponds to **{cluster_labels[user_cluster]}**.")

    # Optional detail view: distance from the input to every cluster centre.
    if st.checkbox("Show Cluster Distances"):
        # One label per distance entry, shared by the table and the chart.
        distance_labels = [f'Cluster {i}' for i in range(len(distances))]

        st.write("**Distance to Each Cluster:**")
        distance_df = pd.DataFrame(distances, index=distance_labels, columns=["Distance"])
        st.dataframe(distance_df)

        # Distance bar plot
        st.subheader("Distance to Each Cluster (Graph)")
        fig, ax = plt.subplots()
        # Colour the bars explicitly from the palette; passing `palette`
        # to sns.barplot without `hue` is deprecated in recent seaborn.
        bar_colors = sns.color_palette('viridis', n_colors=len(distance_labels))
        ax.bar(distance_labels, distances, color=bar_colors)
        ax.set_ylabel("Distance")
        ax.set_title("Distance to Each Cluster")
        st.pyplot(fig)
        # Close the figure so reruns do not accumulate open figures.
        plt.close(fig)

        st.markdown("""
**How to Interpret:**
- A **lower distance** means the input is closer to that cluster's center.
- The predicted cluster will have the smallest distance.
""")
# --------------------- Footer ---------------------
# Horizontal rule followed by the author credit.
st.markdown("---")
st.write("**By: kheejay**")