Spaces:

chiichann
/

customer_segmentation_tool

Sleeping

File size: 3,524 Bytes

6ba4689
 
 
 
 
 
 
 
 
 
 
f4f0c0c
 
 
 
 
5e451cb
 
f6c40ec
5e451cb
f6c40ec
f4f0c0c
 
f6c40ec
5e451cb
6ba4689
f6c40ec
b5535f7
f6c40ec
f4f0c0c
f6c40ec
f4f0c0c
f6c40ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dfe582d
f4f0c0c
 
 
 
f6c40ec
f4f0c0c
 
 
f6c40ec
f4f0c0c
 
 
 
 
 
 
 
 
f6c40ec
f4f0c0c
 
b5535f7

import streamlit as st
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Page heading
st.title("🛍️ Customer Segmentation Tool")

# Tab labels, in display order; the three containers are filled in below
tab_labels = ["📖 About", "📊 Dataset Overview", "🧑‍🤝‍🧑 Customer Segmentation"]
tab1, tab2, tab3 = st.tabs(tab_labels)

# Markdown text displayed in the About tab
about_text = """
    This app uses unsupervised learning techniques to segment customers based on their purchasing behavior. 
    The dataset is preloaded and contains online retail data.
    ### How It Works:
    - **Step 1**: Load customer transaction data, including details like Quantity, UnitPrice, and CustomerID.
    - **Step 2**: Process the data by calculating the total spent and aggregating the information by customer.
    - **Step 3**: Apply **K-Means Clustering** to segment the customers into distinct groups.
    - **Step 4**: Visualize the customer segments with a scatter plot.
    """

# About tab: static description of the app and its four processing steps
with tab1:
    st.write(about_text)

# Load preloaded dataset
file_path = "Online Retail.xlsx"


@st.cache_data
def _load_dataset(path):
    """Read the 'Online Retail' sheet of the workbook at *path* into a DataFrame.

    Streamlit re-executes this script from the top on every widget
    interaction (for example each slider move), so an uncached read would
    re-parse the whole workbook every time. ``st.cache_data`` stores the
    parsed frame keyed on *path* and hands back a fresh copy on each call,
    which keeps later in-place edits from touching the cached data.

    Args:
        path: Location of the Excel workbook to read.

    Returns:
        The contents of the 'Online Retail' sheet as a ``pandas.DataFrame``.
    """
    return pd.read_excel(path, sheet_name='Online Retail')


df = _load_dataset(file_path)

# Dataset Overview Tab: preview the raw data, build per-customer features,
# standardize them, and plot the elbow curve used to pick a cluster count.
with tab2:
    st.write("### Dataset Overview")
    st.write(df.head())

    # Preprocess data
    # Rows without a CustomerID cannot be attributed to a customer.
    # The explicit copy makes the filtered frame independent of the loaded one,
    # so adding a column below never triggers pandas' SettingWithCopyWarning.
    df = df.dropna(subset=["CustomerID"]).copy()
    # Non-numeric Quantity/UnitPrice values become NaN and are dropped next.
    df["TotalSpent"] = pd.to_numeric(df["Quantity"], errors='coerce') * pd.to_numeric(df["UnitPrice"], errors='coerce')
    df = df.dropna(subset=["TotalSpent"])

    # Aggregate data by Customer
    # NOTE(review): "NumTransactions" is the summed Quantity (units bought),
    # not a count of distinct invoices — confirm this is the intended feature.
    customer_data = df.groupby("CustomerID").agg({
        "TotalSpent": "sum",
        "Quantity": "sum",
        "UnitPrice": "mean"
    }).rename(columns={"Quantity": "NumTransactions", "UnitPrice": "AvgUnitPrice"})

    st.write("### Processed Customer Data")
    st.write(customer_data.head())

    # Standardize the data so each feature has zero mean and unit variance;
    # the result keeps the original column names and CustomerID index.
    scaler = StandardScaler()
    customer_scaled = pd.DataFrame(
        scaler.fit_transform(customer_data),
        columns=customer_data.columns,
        index=customer_data.index,
    )

    # Elbow Method to determine optimal clusters: fit K-Means for k = 1..10
    # and record the inertia (within-cluster sum of squares) of each fit.
    st.write("### Elbow Method for Optimal Cluster Selection")
    K = range(1, 11)
    distortions = [
        KMeans(n_clusters=k, random_state=42).fit(customer_scaled).inertia_
        for k in K
    ]

    fig, ax = plt.subplots()
    ax.plot(K, distortions, marker='o')
    ax.set_xlabel("Number of Clusters")
    ax.set_ylabel("Distortion")
    ax.set_title("Elbow Method for Optimal k")
    st.pyplot(fig)
    # pyplot keeps every figure it creates alive until it is closed; without
    # this, each script rerun would leave another figure in memory.
    plt.close(fig)

# Customer Segmentation Tab: cluster the standardized customer features with
# a user-chosen k, plot the result, and summarize each cluster.
with tab3:
    # User selects the number of clusters (2-10, default 3)
    num_clusters = st.slider("Select Number of Clusters", min_value=2, max_value=10, value=3)

    # Apply K-Means clustering on the scaled features; labels are stored on
    # the unscaled frame so plots and summaries stay in original units.
    model = KMeans(n_clusters=num_clusters, random_state=42)
    customer_data["Cluster"] = model.fit_predict(customer_scaled)

    # Visualize the clusters
    st.write("### Clusters Visualization")
    fig, ax = plt.subplots()
    scatter = ax.scatter(
        customer_data["TotalSpent"],
        customer_data["NumTransactions"],
        c=customer_data["Cluster"],
        cmap='viridis',
    )
    ax.set_xlabel("Total Spent")
    ax.set_ylabel("Number of Transactions")
    ax.set_title("Customer Segments")
    # Attach the colorbar to this figure explicitly instead of relying on
    # pyplot's global "current figure" state.
    fig.colorbar(scatter, ax=ax, label="Cluster")
    st.pyplot(fig)
    # pyplot keeps every figure it creates alive until it is closed; without
    # this, each slider move would leave another figure in memory.
    plt.close(fig)

    # Show the segmented customer data: per-cluster mean of each feature
    st.write("### Customer Segments Data")
    st.write(customer_data.groupby("Cluster")[["TotalSpent", "NumTransactions", "AvgUnitPrice"]].mean())