Spaces:

chiichann
/

customer_segmentation_tool

Sleeping

App Files Files Community

chiichann committed on Mar 4, 2025

Commit

6ba4689

verified ·

1 Parent(s): f153315

Update app.py

Browse files

Files changed (1) hide show

app.py +124 -91

app.py CHANGED Viewed

@@ -1,91 +1,124 @@
-import streamlit as st
-import pandas as pd
-import numpy as np
-from sklearn.cluster import KMeans
-from sklearn.preprocessing import StandardScaler
-import matplotlib.pyplot as plt
-import seaborn as sns
-# App title
-st.title("🛍️ Customer Segmentation Tool")
-# 🎯 Streamlit Tabs
-tab1, tab2, tab3 = st.tabs(["📖 About", "📊 Dataset Overview", "🧑‍🤝‍🧑 Customer Segmentation"])
-# About Tab
-with tab1:
-    st.write("""
-    This app uses unsupervised learning techniques to segment customers based on their purchasing behavior.
-    The dataset is preloaded from an Excel file containing online retail data.
-    ### How It Works:
-    - **Step 1**: Load customer transaction data, including details like `Quantity`, `UnitPrice`, and `CustomerID`.
-    - **Step 2**: Process the data by calculating the total spent and aggregating the information by customer.
-    - **Step 3**: Apply **K-Means Clustering** to segment the customers into distinct groups.
-    - **Step 4**: Visualize the customer segments with a scatter plot, and optionally download the segmented data.
-    """)
-# Load dataset
-file_path = "Online Retail.xlsx"
-# Dataset Tab
-with tab2:
-    try:
-        df = pd.read_excel(file_path, sheet_name="Online Retail")
-        st.write("### Dataset Overview")
-        st.write(df.head())
-    except Exception as e:
-        st.error(f"Error loading dataset: {e}")
-        st.stop()
-    # Verify the dataset columns
-    if not all(col in df.columns for col in ["CustomerID", "Quantity", "UnitPrice"]):
-        st.error("The dataset is missing required columns: 'CustomerID', 'Quantity', 'UnitPrice'. Please check the data.")
-        st.stop()
-# Preprocess data
-df = df.dropna(subset=["CustomerID"])  # Remove rows without CustomerID
-df["TotalSpent"] = df["Quantity"] * df["UnitPrice"]  # Create TotalSpent column
-# Aggregate data by CustomerID
-customer_data = df.groupby("CustomerID").agg({
-    "TotalSpent": "sum",
-    "InvoiceNo": "nunique",
-    "Quantity": "sum"
-}).rename(columns={"InvoiceNo": "NumTransactions"})
-# Standardize the data
-scaler = StandardScaler()
-customer_scaled = pd.DataFrame(scaler.fit_transform(customer_data), columns=customer_data.columns, index=customer_data.index)
-# Customer Segmentation Tab
-with tab3:
-    # User selects the number of clusters
-    num_clusters = st.slider("Select Number of Clusters", min_value=2, max_value=10, value=3)
-    # Apply K-Means clustering
-    model = KMeans(n_clusters=num_clusters, random_state=42)
-    customer_data["Cluster"] = model.fit_predict(customer_scaled)
-    # Visualize the clusters
-    st.write("### Clusters Visualization")
-    fig, ax = plt.subplots()
-    scatter = ax.scatter(customer_data["TotalSpent"], customer_data["NumTransactions"], c=customer_data["Cluster"], cmap='viridis')
-    ax.set_xlabel("Total Spent")
-    ax.set_ylabel("Number of Transactions")
-    ax.set_title("Customer Segments")
-    plt.colorbar(scatter, label="Cluster")
-    st.pyplot(fig)
-    # Show the segmented customer data
-    st.write("### Customer Segments Data")
-    st.write(customer_data.head())
-    # Option to download the segmented data
-    csv = customer_data.to_csv(index=True)
-    st.download_button(
-        label="Download Segmented Customer Data",
-        data=csv,
-        file_name="segmented_customer_data.csv",
-        mime="text/csv"
-    )

+import streamlit as st
+import pandas as pd
+import numpy as np
+from sklearn.cluster import KMeans
+from sklearn.preprocessing import StandardScaler
+import matplotlib.pyplot as plt
+import seaborn as sns
+# App title
+st.title("🛍️ Customer Segmentation Tool")
+# 🎯 Streamlit Tabs
+tab1, tab2, tab3 = st.tabs(["📖 About", "📊 Dataset Overview", "🧑‍🤝‍🧑 Customer Segmentation", "📥 Download Dataset"])
+# About Tab
+with tab1:
+    st.write("""
+    This app uses unsupervised learning techniques to segment customers based on their purchasing behavior.
+    The dataset is uploaded by the user, containing online retail data.
+    ### How It Works:
+    - **Step 1**: Upload customer transaction data, including details like `Quantity`, `UnitPrice`, and `CustomerID`.
+    - **Step 2**: Process the data by calculating the total spent and aggregating the information by customer.
+    - **Step 3**: Apply **K-Means Clustering** to segment the customers into distinct groups.
+    - **Step 4**: Visualize the customer segments with a scatter plot, and optionally download the segmented data.
+    """)
+# File uploader in the Dataset Tab
+with tab2:
+    uploaded_file = st.file_uploader("Upload Your Dataset", type=["csv"])
+    if uploaded_file is not None:
+        try:
+            # Load the CSV file
+            df = pd.read_csv(uploaded_file)
+            st.write("### Dataset Overview")
+            st.write(df.head())
+        except Exception as e:
+            st.error(f"Error loading dataset: {e}")
+            st.stop()
+        # Automatically detect possible columns
+        st.write("### Columns detected in your dataset:")
+        st.write(df.columns.tolist())
+        # Allow the user to map columns
+        customer_col = st.selectbox("Select Customer Column", df.columns.tolist(), index=df.columns.tolist().index("CustomerID") if "CustomerID" in df.columns else 0)
+        quantity_col = st.selectbox("Select Quantity Column", df.columns.tolist(), index=df.columns.tolist().index("Quantity") if "Quantity" in df.columns else 0)
+        unit_price_col = st.selectbox("Select Unit Price Column", df.columns.tolist(), index=df.columns.tolist().index("UnitPrice") if "UnitPrice" in df.columns else 0)
+        # Check if the selected columns exist
+        if customer_col not in df.columns or quantity_col not in df.columns or unit_price_col not in df.columns:
+            st.error("One or more selected columns do not exist in the dataset. Please select valid columns.")
+            st.stop()
+        # Preprocess data
+        df = df.dropna(subset=[customer_col])  # Remove rows without CustomerID
+        df["TotalSpent"] = df[quantity_col] * df[unit_price_col]  # Create TotalSpent column
+        # Aggregate data by Customer
+        customer_data = df.groupby(customer_col).agg({
+            "TotalSpent": "sum",
+            quantity_col: "sum",
+            unit_price_col: "mean"
+        }).rename(columns={quantity_col: "NumTransactions", unit_price_col: "AvgUnitPrice"})
+        # Standardize the data
+        scaler = StandardScaler()
+        customer_scaled = pd.DataFrame(scaler.fit_transform(customer_data), columns=customer_data.columns, index=customer_data.index)
+# Customer Segmentation Tab
+with tab3:
+    if uploaded_file is not None:
+        # User selects the number of clusters
+        num_clusters = st.slider("Select Number of Clusters", min_value=2, max_value=10, value=3)
+        # Apply K-Means clustering
+        model = KMeans(n_clusters=num_clusters, random_state=42)
+        customer_data["Cluster"] = model.fit_predict(customer_scaled)
+        # Visualize the clusters
+        st.write("### Clusters Visualization")
+        fig, ax = plt.subplots()
+        scatter = ax.scatter(customer_data["TotalSpent"], customer_data["NumTransactions"], c=customer_data["Cluster"], cmap='viridis')
+        ax.set_xlabel("Total Spent")
+        ax.set_ylabel("Number of Transactions")
+        ax.set_title("Customer Segments")
+        plt.colorbar(scatter, label="Cluster")
+        st.pyplot(fig)
+        # Show the segmented customer data
+        st.write("### Customer Segments Data")
+        st.write(customer_data.head())
+        # Option to download the segmented data
+        csv = customer_data.to_csv(index=True)
+        st.download_button(
+            label="Download Segmented Customer Data",
+            data=csv,
+            file_name="segmented_customer_data.csv",
+            mime="text/csv"
+        )
+    else:
+        st.write("Please upload a dataset to start.")
+# Download Dataset Tab
+with tab4:
+    st.write("""
+    You can download the sample 'Online Retail' dataset to get started with customer segmentation tasks.
+    Click the button below to download the dataset in CSV format.
+    """)
+    # Dataset file path (local or cloud-based)
+    # For example, we can provide a URL link to an external dataset or include a local file download.
+    dataset_url = "https://path_to_your_dataset/Online_Retail.csv"  # Replace with actual URL if necessary
+    # Button to download the sample dataset
+    st.download_button(
+        label="Download Online Retail Dataset",
+        data=pd.read_csv(dataset_url).to_csv(index=False),
+        file_name="Online_Retail.csv",
+        mime="text/csv"
+    )