Spaces:

chiichann
/

customer_segmentation_tool

Sleeping

App Files Files Community

chiichann committed on Mar 4, 2025

Commit

0b333c7

verified ·

1 Parent(s): 53c2b9d

Update app.py

Browse files

Files changed (1) hide show

app.py +55 -83

app.py CHANGED Viewed

@@ -5,100 +5,72 @@ from sklearn.cluster import KMeans
 from sklearn.preprocessing import StandardScaler
 import matplotlib.pyplot as plt
 import seaborn as sns
-import requests
 # App title
 st.title("🛍️ Customer Segmentation Tool")
-tab1, tab2, tab3, tab4 = st.tabs(["📖 About", "📊 Dataset Overview", "🧑‍🤝‍🧑 Customer Segmentation", "📥 Download Dataset"])
 # About Tab
 with tab1:
     st.write("""
-    This app uses unsupervised learning techniques to segment customers based on their purchasing behavior.
     """)
-# File uploader in the Dataset Tab
-with tab2:
-    uploaded_file = st.file_uploader("Upload Your Dataset", type=["csv", "xlsx"])
-    if uploaded_file is not None:
-        try:
-            if uploaded_file.name.endswith('.csv'):
-                df = pd.read_csv(uploaded_file, encoding='ISO-8859-1', on_bad_lines='skip')
-            elif uploaded_file.name.endswith('.xlsx'):
-                df = pd.read_excel(uploaded_file)
-            else:
-                st.error("Unsupported file format. Please upload a CSV or Excel file.")
-                st.stop()
-            st.write("### Dataset Overview")
-            st.write(df.head())
-        except Exception as e:
-            st.error(f"Error loading dataset: {e}")
-            st.stop()
-        st.write("### Columns detected in your dataset:")
-        st.write(df.columns.tolist())
-        customer_col = st.selectbox("Select Customer Column", df.columns.tolist())
-        quantity_col = st.selectbox("Select Quantity Column", df.columns.tolist())
-        unit_price_col = st.selectbox("Select Unit Price Column", df.columns.tolist())
-        if customer_col not in df.columns or quantity_col not in df.columns or unit_price_col not in df.columns:
-            st.error("One or more selected columns do not exist in the dataset. Please select valid columns.")
-            st.stop()
-        df = df.dropna(subset=[customer_col])
-        df["TotalSpent"] = pd.to_numeric(df[quantity_col], errors='coerce') * pd.to_numeric(df[unit_price_col], errors='coerce')
-        df = df.dropna(subset=["TotalSpent"])
-        customer_data = df.groupby(customer_col).agg({
-            "TotalSpent": "sum",
-            quantity_col: "sum",
-            unit_price_col: "mean"
-        })
-        # Debugging: Check column names before renaming
-        st.write("### Columns before renaming:", customer_data.columns.tolist())
-        customer_data = customer_data.rename(columns={quantity_col: "NumTransactions", unit_price_col: "AvgUnitPrice"})
-        # Debugging: Check column names after renaming
-        st.write("### Columns after renaming:", customer_data.columns.tolist())
-        if "NumTransactions" not in customer_data.columns:
-            st.error("Error: 'NumTransactions' column is missing after processing. Please check column mapping.")
-            st.stop()
-        scaler = StandardScaler()
-        customer_scaled = pd.DataFrame(scaler.fit_transform(customer_data), columns=customer_data.columns, index=customer_data.index)
 # Customer Segmentation Tab
 with tab3:
-    if uploaded_file is not None:
-        num_clusters = st.slider("Select Number of Clusters", min_value=2, max_value=10, value=3)
-        model = KMeans(n_clusters=num_clusters, random_state=42)
-        customer_data["Cluster"] = model.fit_predict(customer_scaled)
-        st.write("### Clusters Visualization")
-        fig, ax = plt.subplots()
-        scatter = ax.scatter(customer_data["TotalSpent"], customer_data["NumTransactions"], c=customer_data["Cluster"], cmap='viridis')
-        ax.set_xlabel("Total Spent")
-        ax.set_ylabel("Number of Transactions")
-        ax.set_title("Customer Segments")
-        plt.colorbar(scatter, label="Cluster")
-        st.pyplot(fig)
-        st.write("### Customer Segments Data")
-        st.write(customer_data.head())
-        csv = customer_data.to_csv(index=True)
-        st.download_button(
-            label="Download Segmented Customer Data",
-            data=csv,
-            file_name="segmented_customer_data.csv",
-            mime="text/csv"
-        )
-    else:
-        st.write("Please upload a dataset to start.")

 from sklearn.preprocessing import StandardScaler
 import matplotlib.pyplot as plt
 import seaborn as sns
 # App title
 st.title("🛍️ Customer Segmentation Tool")
+# 🎯 Streamlit Tabs
+tab1, tab2, tab3 = st.tabs(["📖 About", "📊 Dataset Overview", "🧑‍🤝‍🧑 Customer Segmentation"])
 # About Tab
 with tab1:
     st.write("""
+    This app uses unsupervised learning techniques to segment customers based on their purchasing behavior.
+    The dataset is preloaded and contains online retail data.
+    ### How It Works:
+    - **Step 1**: Load customer transaction data, including details like Quantity, UnitPrice, and CustomerID.
+    - **Step 2**: Process the data by calculating the total spent and aggregating the information by customer.
+    - **Step 3**: Apply **K-Means Clustering** to segment the customers into distinct groups.
+    - **Step 4**: Visualize the customer segments with a scatter plot.
     """)
+# Load preloaded dataset
+# NOTE(review): absolute path outside the app directory — confirm the workbook
+# actually exists at this location in the deployed environment.
+file_path = "/mnt/data/Online Retail.xlsx"
+
+
+@st.cache_data
+def load_dataset(path):
+    """Read the 'Online Retail' sheet of the workbook at *path* into a DataFrame.
+
+    Cached by Streamlit so the workbook is parsed once rather than on every
+    script rerun (each widget interaction reruns the whole script).
+    """
+    return pd.read_excel(path, sheet_name='Online Retail')
+
+
+try:
+    df = load_dataset(file_path)
+except (OSError, ValueError) as e:
+    # OSError covers a missing/unreadable file; ValueError covers a missing sheet.
+    # Report it in the UI and halt instead of surfacing a raw traceback.
+    st.error(f"Error loading dataset: {e}")
+    st.stop()
+# Dataset Overview Tab
+with tab2:
+    st.write("### Dataset Overview")
+    st.write(df.head())
+    # Preprocess data
+    df = df.dropna(subset=["CustomerID"])  # Remove rows without CustomerID
+    df["TotalSpent"] = pd.to_numeric(df["Quantity"], errors='coerce') * pd.to_numeric(df["UnitPrice"], errors='coerce')
+    df = df.dropna(subset=["TotalSpent"])
+    # Aggregate data by Customer
+    customer_data = df.groupby("CustomerID").agg({
+        "TotalSpent": "sum",
+        "Quantity": "sum",
+        "UnitPrice": "mean"
+    }).rename(columns={"Quantity": "NumTransactions", "UnitPrice": "AvgUnitPrice"})
+    st.write("### Processed Customer Data")
+    st.write(customer_data.head())
+    # Standardize the data
+    scaler = StandardScaler()
+    customer_scaled = pd.DataFrame(scaler.fit_transform(customer_data), columns=customer_data.columns, index=customer_data.index)
 # Customer Segmentation Tab
 with tab3:
+    # User selects the number of clusters
+    num_clusters = st.slider("Select Number of Clusters", min_value=2, max_value=10, value=3)
+    # Apply K-Means clustering on the standardized features.
+    # n_init is pinned because its default differs between scikit-learn releases.
+    model = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
+    customer_data["Cluster"] = model.fit_predict(customer_scaled)
+    # Visualize the clusters
+    st.write("### Clusters Visualization")
+    fig, ax = plt.subplots()
+    scatter = ax.scatter(customer_data["TotalSpent"], customer_data["NumTransactions"], c=customer_data["Cluster"], cmap='viridis')
+    ax.set_xlabel("Total Spent")
+    ax.set_ylabel("Number of Transactions")
+    ax.set_title("Customer Segments")
+    # Attach the colorbar to this figure explicitly rather than via pyplot's "current figure"
+    fig.colorbar(scatter, ax=ax, label="Cluster")
+    st.pyplot(fig)
+    # pyplot keeps every figure it created alive until closed; the script reruns on
+    # each interaction, so close the figure once it has been rendered.
+    plt.close(fig)
+    # Show the segmented customer data
+    st.write("### Customer Segments Data")
+    st.write(customer_data.head())