File size: 3,327 Bytes
0e9897a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# app.py

import streamlit as st
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

# Streamlit page settings: browser-tab title/icon and a full-width layout.
# The icon and title emoji were mojibake (UTF-8 bytes decoded with the wrong
# codepage); they are restored to the intended robot emoji here.
st.set_page_config(page_title="K-Means Clustering App", page_icon="🤖", layout="wide")

# Title and a one-line description shown at the top of the page
st.title("🤖 K-Means Clustering Explorer")
st.write("This app performs **K-Means Clustering** on a customer segmentation dataset.")

# Load dataset (local file)
@st.cache_data
def load_data():
    """Read the customer dataset from ``Mall_Customers.csv``.

    The path is relative, so the file is looked up from the directory the
    app is launched in. The result is memoised by ``st.cache_data`` so the
    CSV is not re-read on every script rerun.

    Returns:
        The parsed CSV as a pandas DataFrame.
    """
    return pd.read_csv("Mall_Customers.csv")

# Load the dataset. st.cache_data hands back a fresh copy on each call, so
# adding the 'Cluster' column below does not leak into the cached value.
data = load_data()

# Select the two features used for clustering
FEATURE_COLUMNS = ['Annual Income (k$)', 'Spending Score (1-100)']
features = data[FEATURE_COLUMNS]

# Sidebar: let the user pick K (1..10, default 3)
st.sidebar.header("Settings")
k = st.sidebar.slider("Select number of clusters (K)", 1, 10, 3)

# Perform KMeans clustering with the selected K and label every row
kmeans = KMeans(n_clusters=k, init='k-means++', random_state=42)
clusters = kmeans.fit_predict(features)
data['Cluster'] = clusters


# Calculate Elbow Method data
@st.cache_data
def _compute_wcss(points, max_k=10):
    """Return the K-Means inertia (WCSS) for K = 1..max_k on *points*.

    The curve depends only on the feature data, not on the slider value, so
    it is cached instead of refitting ``max_k`` models on every rerun.

    Args:
        points: Feature table to cluster.
        max_k: Largest number of clusters to evaluate (inclusive).

    Returns:
        A list of ``max_k`` inertia values, index ``i`` holding K = i + 1.
    """
    return [
        KMeans(n_clusters=n, init='k-means++', random_state=42).fit(points).inertia_
        for n in range(1, max_k + 1)
    ]


wcss = _compute_wcss(features)

# Analyze clusters: mean of each feature per cluster label
cluster_summary = data.groupby('Cluster')[FEATURE_COLUMNS].mean()

def interpret_cluster(income: float, spending: float) -> str:
    """Map a cluster's mean income and spending score to a segment label.

    The emoji prefixes of the labels were mojibake and are restored here;
    the thresholds are unchanged.

    Args:
        income: Mean annual income of the cluster, in k$.
        spending: Mean spending score of the cluster (1-100 scale).

    Returns:
        One of the four named segment labels when both values fall in a
        high/low band, otherwise the generic "Standard Customers" label
        (e.g. mid-range income, or a spending score between the bands).
    """
    high_income = income >= 70
    low_income = income <= 40

    # The income bands are mutually exclusive, so at most one rule matches.
    if high_income and spending >= 50:
        return "💎 Premium Customers (High Income, High Spending)"
    if low_income and spending >= 60:
        return "🔔 Potential Risk Customers (Low Income, High Spending)"
    if high_income and spending <= 40:
        return "💼 Careful Spenders (High Income, Low Spending)"
    if low_income and spending <= 40:
        return "🛒 Budget Customers (Low Income, Low Spending)"
    return "🧩 Standard Customers"

# Create Tabs (emoji in the labels restored from mojibake)
tab1, tab2, tab3, tab4 = st.tabs(["📄 Raw Dataset", "📈 Elbow Method", "🎯 Clustered Customers", "📝 Cluster Explanations"])

with tab1:
    st.subheader("🧹 Raw Dataset")
    # 'Cluster' is a derived column added after loading; hide it so this
    # preview really shows the raw data. errors='ignore' keeps this safe if
    # the column is absent.
    st.dataframe(data.drop(columns='Cluster', errors='ignore').head())

with tab2:
    st.subheader("📈 Elbow Method (to find optimal K)")
    fig, ax = plt.subplots()
    # x axis is K = 1..len(wcss), one point per fitted model
    ax.plot(range(1, len(wcss) + 1), wcss, marker='o')
    ax.set_xlabel('Number of Clusters (K)')
    ax.set_ylabel('WCSS (Within Cluster Sum of Squares)')
    ax.set_title('The Elbow Method')
    st.pyplot(fig)
    # pyplot keeps every figure it creates alive until closed; the script
    # reruns on each interaction, so close it to avoid accumulating figures.
    plt.close(fig)

with tab3:
    st.subheader("🎯 Clustered Customers")
    fig2, ax2 = plt.subplots()
    # One colour per cluster label
    palette = sns.color_palette("bright", k)
    sns.scatterplot(
        x='Annual Income (k$)',
        y='Spending Score (1-100)',
        hue='Cluster',
        palette=palette,
        data=data,
        ax=ax2,
        s=100,
    )
    # Overlay the fitted centroids; columns 0 and 1 follow the feature order
    # used for fitting (income, spending score).
    ax2.scatter(
        kmeans.cluster_centers_[:, 0],
        kmeans.cluster_centers_[:, 1],
        s=300,
        c='black',
        marker='X',
        label='Centroids',
    )
    ax2.legend()
    ax2.set_title('Customer Segments')
    st.pyplot(fig2)
    # pyplot keeps every figure it creates alive until closed; the script
    # reruns on each interaction, so close it to avoid accumulating figures.
    plt.close(fig2)

with tab4:
    # Subheader emoji restored from mojibake
    st.subheader("📝 Cluster Explanations")
    # One line per cluster: label derived from the cluster's mean income/spending
    for cluster_num, row in cluster_summary.iterrows():
        explanation = interpret_cluster(row['Annual Income (k$)'], row['Spending Score (1-100)'])
        st.markdown(f"**Cluster {cluster_num}:** {explanation}")
    # Per-cluster means, with the largest value in each column highlighted
    st.dataframe(cluster_summary.style.highlight_max(axis=0))

# Footer (heart emoji restored from mojibake)
st.markdown("---")
st.caption("Made with ❤️ using Streamlit")