File size: 5,922 Bytes
8cf266f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6315b8e
 
8cf266f
 
6315b8e
8cf266f
 
 
 
 
 
 
 
 
 
 
 
6315b8e
 
8cf266f
6315b8e
 
8cf266f
6315b8e
 
8cf266f
6315b8e
 
 
8cf266f
f447860
 
 
6315b8e
 
 
 
8cf266f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6315b8e
8cf266f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6315b8e
8cf266f
44c4669
8cf266f
 
 
44c4669
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8cf266f
 
44c4669
 
 
8cf266f
 
 
 
6315b8e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
import streamlit as st
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
from datasets import load_dataset

# Set page configuration
st.set_page_config(page_title="Unsupervised ML: Mall Customer Segmentation", layout="wide")

# --------------------- Load and preprocess the dataset ---------------------
@st.cache_data
def load_data():
    dataset = load_dataset("kheejay88/mall_customers", split="train")
    df = pd.DataFrame(dataset)
    return df

df = load_data()

# Preprocess data
@st.cache_data
def preprocess_data(df):
    features = ['Annual Income (k$)', 'Spending Score (1-100)']
    X = df[features]
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X, X_scaled, features, scaler

X, X_scaled, features, scaler = preprocess_data(df)

# --------------------- Perform K-Means clustering ---------------------
@st.cache_data
def perform_clustering(X_scaled, n_clusters=5):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    clusters = kmeans.fit_predict(X_scaled)
    return kmeans, clusters

kmeans, clusters = perform_clustering(X_scaled)

# Add cluster labels to the dataframe
df['Cluster'] = clusters
cluster_labels = {i: f'Cluster {i}' for i in range(kmeans.n_clusters)}
df['Cluster Label'] = df['Cluster'].map(cluster_labels)

# --------------------- Sidebar for user input ---------------------
st.sidebar.header("User Input Features")
annual_income = st.sidebar.slider(
    'Annual Income (k$)',
    int(df['Annual Income (k$)'].min()),
    int(df['Annual Income (k$)'].max()),
    int(df['Annual Income (k$)'].mean())
)
spending_score = st.sidebar.slider(
    'Spending Score (1-100)',
    int(df['Spending Score (1-100)'].min()),
    int(df['Spending Score (1-100)'].max()),
    int(df['Spending Score (1-100)'].mean())
)

# --------------------- Predict cluster for user input ---------------------
def predict_cluster(annual_income, spending_score, kmeans, scaler):
    input_data = pd.DataFrame([[annual_income, spending_score]], columns=features)
    input_scaled = scaler.transform(input_data)
    cluster = kmeans.predict(input_scaled)[0]
    distances = kmeans.transform(input_scaled)[0]
    return cluster, distances

# Prediction
user_cluster, distances = predict_cluster(annual_income, spending_score, kmeans, scaler)

# --------------------- Main panel ---------------------
st.title("πŸ›οΈ Mall Customer Segmentation App")

# --------------------- Tabs ---------------------
tab1, tab2, tab3 = st.tabs(["🏠 About", "πŸ“Š Data Visualization", "πŸ”Ž Predict Cluster"])

# --------------------- About Tab ---------------------
with tab1:
    st.header("About This App")
    st.markdown("""
    ## Overview
    This app uses **K-Means clustering** to segment mall customers based on their annual income and spending score.
    
    ## How It Works
    1. **Data Preprocessing**: 
       - Data is scaled using `StandardScaler` to ensure even distribution.

    2. **Clustering**:
       - The K-Means algorithm groups customers into 5 clusters.

    3. **Prediction**:
       - Users can provide input values and the app will predict which cluster they belong to.
    
    ## Dataset Information
    """)
    st.dataframe(df.head())
    st.markdown("""
    The dataset contains **200 samples** of customer data with the following features:
    - **Annual Income (k$)**
    - **Spending Score (1-100)**
    - **Customer ID, Gender, Age** (used for reference)
    """)

# --------------------- Data Visualization Tab ---------------------
with tab2:
    st.header("Data Visualization")

    # Cluster distribution plot
    st.subheader("Cluster Distribution")
    fig, ax = plt.subplots()
    sns.scatterplot(
        x=df['Annual Income (k$)'],
        y=df['Spending Score (1-100)'],
        hue=df['Cluster Label'],
        palette='viridis',
        s=100,
        alpha=0.7,
        ax=ax
    )
    plt.xlabel('Annual Income (k$)')
    plt.ylabel('Spending Score (1-100)')
    plt.title('Customer Segments')
    st.pyplot(fig)

    # Feature importance (Cluster centers)
    st.subheader("Cluster Centers")
    cluster_centers = pd.DataFrame(
        scaler.inverse_transform(kmeans.cluster_centers_),
        columns=features
    )
    cluster_centers['Cluster'] = cluster_labels.values()
    st.dataframe(cluster_centers)

# --------------------- Predict Cluster Tab ---------------------
with tab3:
    st.header("Predict Cluster for Custom Input")

    # Display user input
    st.subheader("User Input:")
    st.write(f"**Annual Income (k$):** {annual_income}")
    st.write(f"**Spending Score (1-100):** {spending_score}")

    # Display predicted cluster
    st.subheader("Predicted Cluster:")
    st.write(f"Your input corresponds to **{cluster_labels[user_cluster]}**.")

    # Show cluster center distances with explanation
    if st.checkbox("Show Cluster Distances"):
        st.write("**Distance to Each Cluster:**")

        # Display distances in a table
        distance_df = pd.DataFrame(distances, index=[f'Cluster {i}' for i in range(len(distances))], columns=["Distance"])
        st.dataframe(distance_df)

        # πŸ“Š Distance bar plot
        st.subheader("Distance to Each Cluster (Graph)")

        fig, ax = plt.subplots()
        sns.barplot(
            x=[f'Cluster {i}' for i in range(len(distances))],
            y=distances,
            palette='viridis',
            ax=ax
        )
        plt.ylabel("Distance")
        plt.title("Distance to Each Cluster")
        st.pyplot(fig)

        st.markdown("""
        **How to Interpret:**
        - A **lower distance** means the input is closer to that cluster's center.
        - The predicted cluster will have the smallest distance.
        """)

# --------------------- Footer ---------------------
st.markdown("---")
st.write("**By: kheejay**")