File size: 3,524 Bytes
6ba4689
 
 
 
 
 
 
 
 
 
 
f4f0c0c
 
 
 
 
5e451cb
 
f6c40ec
5e451cb
f6c40ec
f4f0c0c
 
f6c40ec
5e451cb
6ba4689
f6c40ec
b5535f7
f6c40ec
f4f0c0c
f6c40ec
f4f0c0c
f6c40ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dfe582d
f4f0c0c
 
 
 
f6c40ec
f4f0c0c
 
 
f6c40ec
f4f0c0c
 
 
 
 
 
 
 
 
f6c40ec
f4f0c0c
 
b5535f7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Page heading rendered above the tab bar
st.title("🛍️ Customer Segmentation Tool")

# Tab captions, in the order they appear on screen
_tab_labels = ["📖 About", "📊 Dataset Overview", "🧑‍🤝‍🧑 Customer Segmentation"]
tab1, tab2, tab3 = st.tabs(_tab_labels)

# Markdown body of the About tab (text kept verbatim, indentation included)
_about_markdown = """
    This app uses unsupervised learning techniques to segment customers based on their purchasing behavior. 
    The dataset is preloaded and contains online retail data.
    ### How It Works:
    - **Step 1**: Load customer transaction data, including details like Quantity, UnitPrice, and CustomerID.
    - **Step 2**: Process the data by calculating the total spent and aggregating the information by customer.
    - **Step 3**: Apply **K-Means Clustering** to segment the customers into distinct groups.
    - **Step 4**: Visualize the customer segments with a scatter plot.
    """

# About tab: static description of what the app does
with tab1:
    st.write(_about_markdown)

# Load preloaded dataset
file_path = "Online Retail.xlsx"

# Streamlit re-executes this whole script on every widget interaction, so the
# workbook is parsed once and memoised instead of being re-read on each rerun.
# `st.cache_data` is used when the installed Streamlit provides it; otherwise
# fall back to the older `st.cache` decorator.
# NOTE: the cache is keyed on the arguments only, so a changed workbook under
# the same path is not picked up until the cache is cleared or the app restarts.
_cache_data = getattr(st, "cache_data", None) or st.cache


@_cache_data
def _load_dataset(path: str, sheet_name: str) -> pd.DataFrame:
    """Read one worksheet of the Excel workbook at *path* into a DataFrame.

    Args:
        path: Location of the ``.xlsx`` workbook.
        sheet_name: Name of the worksheet to read.

    Returns:
        The worksheet contents as returned by ``pandas.read_excel``.

    Raises:
        FileNotFoundError: If no file exists at *path*.
    """
    return pd.read_excel(path, sheet_name=sheet_name)


try:
    df = _load_dataset(file_path, "Online Retail")
except FileNotFoundError:
    # Show a readable message instead of a raw traceback, then halt this run:
    # every later step of the script needs `df`.
    st.error(f"Dataset file not found: {file_path}")
    st.stop()

# Dataset Overview Tab
with tab2:
    st.write("### Dataset Overview")
    st.write(df.head())  # preview of the raw rows, before any cleaning

    # Preprocess data
    # Remove rows without CustomerID, then add the per-row spend with `assign`,
    # which returns a new frame. Writing a column straight onto the result of
    # `dropna` can trigger pandas' SettingWithCopyWarning.
    df = df.dropna(subset=["CustomerID"])
    df = df.assign(
        TotalSpent=pd.to_numeric(df["Quantity"], errors='coerce')
        * pd.to_numeric(df["UnitPrice"], errors='coerce')
    )
    # Rows whose Quantity or UnitPrice could not be parsed end up NaN here
    df = df.dropna(subset=["TotalSpent"])

    # Aggregate data by Customer
    # NOTE(review): "NumTransactions" is the SUM of Quantity per customer
    # (units purchased), not a count of transactions. The name is kept because
    # the segmentation tab reads this column; confirm which metric is intended.
    customer_data = df.groupby("CustomerID").agg({
        "TotalSpent": "sum",
        "Quantity": "sum",
        "UnitPrice": "mean"
    }).rename(columns={"Quantity": "NumTransactions", "UnitPrice": "AvgUnitPrice"})

    st.write("### Processed Customer Data")
    st.write(customer_data.head())

    # Standardize the data so each feature has zero mean and unit variance;
    # the result keeps the original column names and CustomerID index.
    scaler = StandardScaler()
    customer_scaled = pd.DataFrame(
        scaler.fit_transform(customer_data),
        columns=customer_data.columns,
        index=customer_data.index,
    )

    # Elbow Method to determine optimal clusters
    st.write("### Elbow Method for Optimal Cluster Selection")
    K = range(1, 11)
    # `n_init` is given explicitly: its default differs between scikit-learn
    # releases, which would otherwise change the curve (and emit a
    # FutureWarning on some versions).
    distortions = [
        KMeans(n_clusters=k, n_init=10, random_state=42).fit(customer_scaled).inertia_
        for k in K
    ]

    fig, ax = plt.subplots()
    ax.plot(K, distortions, marker='o')
    ax.set_xlabel("Number of Clusters")
    ax.set_ylabel("Distortion")
    ax.set_title("Elbow Method for Optimal k")
    st.pyplot(fig)
    # Figures created through pyplot stay registered until closed; close it so
    # they do not pile up across Streamlit reruns.
    plt.close(fig)

# Customer Segmentation Tab
with tab3:
    # User selects the number of clusters
    num_clusters = st.slider("Select Number of Clusters", min_value=2, max_value=10, value=3)

    # Apply K-Means clustering
    # `n_init` is given explicitly: its default differs between scikit-learn
    # releases, which would otherwise make the segments version-dependent.
    model = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
    customer_data["Cluster"] = model.fit_predict(customer_scaled)

    # Visualize the clusters
    # NOTE(review): "NumTransactions" holds the per-customer sum of Quantity
    # (see the aggregation in the Dataset Overview tab), so the y-axis is
    # really units purchased; confirm the intended label.
    st.write("### Clusters Visualization")
    fig, ax = plt.subplots()
    scatter = ax.scatter(
        customer_data["TotalSpent"],
        customer_data["NumTransactions"],
        c=customer_data["Cluster"],
        cmap='viridis',
    )
    ax.set_xlabel("Total Spent")
    ax.set_ylabel("Number of Transactions")
    ax.set_title("Customer Segments")
    # Attach the colorbar to this figure/axes explicitly rather than relying on
    # pyplot's "current figure", and tick it at the integer cluster labels.
    fig.colorbar(scatter, ax=ax, label="Cluster", ticks=list(range(num_clusters)))
    st.pyplot(fig)
    # Figures created through pyplot stay registered until closed; close it so
    # they do not pile up each time the slider triggers a rerun.
    plt.close(fig)

    # Show the segmented customer data (per-cluster feature means)
    st.write("### Customer Segments Data")
    st.write(customer_data.groupby("Cluster").agg({"TotalSpent": "mean", "NumTransactions": "mean", "AvgUnitPrice": "mean"}))