Spaces:

chiichann
/

customer_segmentation_tool

Sleeping

App Files Files Community

chiichann committed on Mar 4, 2025

Commit

c68d93e

verified ·

1 Parent(s): 7766924

Update app.py

Browse files

Files changed (1) hide show

app.py +50 -109

app.py CHANGED Viewed

@@ -5,97 +5,83 @@ from sklearn.cluster import KMeans
 from sklearn.preprocessing import StandardScaler
 import matplotlib.pyplot as plt
 import seaborn as sns
-import requests
-from io import StringIO
 # App title
 st.title("🛍️ Customer Segmentation Tool")
-# 🎯 Streamlit Tabs
 tab1, tab2, tab3, tab4 = st.tabs(["📖 About", "📊 Dataset Overview", "🧑‍🤝‍🧑 Customer Segmentation", "📥 Download Dataset"])
 # About Tab
 with tab1:
     st.write("""
-    This app uses unsupervised learning techniques to segment customers based on their purchasing behavior.
-    The dataset is uploaded by the user, containing online retail data.
-    ### How It Works:
-    - **Step 1**: Upload customer transaction data, including details like `Quantity`, `UnitPrice`, and `CustomerID`.
-    - **Step 2**: Process the data by calculating the total spent and aggregating the information by customer.
-    - **Step 3**: Apply **K-Means Clustering** to segment the customers into distinct groups.
-    - **Step 4**: Visualize the customer segments with a scatter plot, and optionally download the segmented data.
     """)
 # File uploader in the Dataset Tab
 with tab2:
-    uploaded_file = st.file_uploader("Upload Your Dataset", type=["csv", "xlsx"])
-    if uploaded_file is not None:
-        try:
-            # Check file type
-            if uploaded_file.name.endswith('.csv'):
-                # Read CSV file with error handling for malformed lines
-                df = pd.read_csv(uploaded_file, encoding='ISO-8859-1', on_bad_lines='skip')
-            elif uploaded_file.name.endswith('.xlsx'):
-                # Read Excel file
-                df = pd.read_excel(uploaded_file)
-            else:
-                st.error("Unsupported file format. Please upload a CSV or Excel file.")
-                st.stop()
-            st.write("### Dataset Overview")
-            st.write(df.head())
-        except Exception as e:
-            st.error(f"Error loading dataset: {e}")
-            st.stop()
-        # Automatically detect possible columns
-        st.write("### Columns detected in your dataset:")
-        st.write(df.columns.tolist())
-        # Allow the user to map columns
-        customer_col = st.selectbox("Select Customer Column", df.columns.tolist(), index=df.columns.tolist().index("CustomerID") if "CustomerID" in df.columns else 0)
-        quantity_col = st.selectbox("Select Quantity Column", df.columns.tolist(), index=df.columns.tolist().index("Quantity") if "Quantity" in df.columns else 0)
-        unit_price_col = st.selectbox("Select Unit Price Column", df.columns.tolist(), index=df.columns.tolist().index("UnitPrice") if "UnitPrice" in df.columns else 0)
-        # Check if the selected columns exist
-        if customer_col not in df.columns or quantity_col not in df.columns or unit_price_col not in df.columns:
-            st.error("One or more selected columns do not exist in the dataset. Please select valid columns.")
-            st.stop()
-        # Preprocess data
-        df = df.dropna(subset=[customer_col])  # Remove rows without CustomerID
-        df["TotalSpent"] = pd.to_numeric(df[quantity_col], errors='coerce') * pd.to_numeric(df[unit_price_col], errors='coerce')  # Ensure numeric type
-        # Ensure that TotalSpent column is not NaN
         df = df.dropna(subset=["TotalSpent"])
-        # Aggregate data by Customer
         customer_data = df.groupby(customer_col).agg({
             "TotalSpent": "sum",
             quantity_col: "sum",
             unit_price_col: "mean"
         }).rename(columns={quantity_col: "NumTransactions", unit_price_col: "AvgUnitPrice"})
-        # Debug: Check if 'NumTransactions' exists in the DataFrame
-        st.write("### Available columns in the aggregated customer data:")
-        st.write(customer_data.columns.tolist())
-        # Standardize the data
         scaler = StandardScaler()
         customer_scaled = pd.DataFrame(scaler.fit_transform(customer_data), columns=customer_data.columns, index=customer_data.index)
-# Customer Segmentation Tab
-with tab3:
-    if uploaded_file is not None:
-        # User selects the number of clusters
         num_clusters = st.slider("Select Number of Clusters", min_value=2, max_value=10, value=3)
-        # Apply K-Means clustering
         model = KMeans(n_clusters=num_clusters, random_state=42)
         customer_data["Cluster"] = model.fit_predict(customer_scaled)
-        # Visualize the clusters
         st.write("### Clusters Visualization")
         fig, ax = plt.subplots()
         scatter = ax.scatter(customer_data["TotalSpent"], customer_data["NumTransactions"], c=customer_data["Cluster"], cmap='viridis')
@@ -105,52 +91,7 @@ with tab3:
         plt.colorbar(scatter, label="Cluster")
         st.pyplot(fig)
-        # Show the segmented customer data
-        st.write("### Customer Segments Data")
-        st.write(customer_data.head())
-        # Option to download the segmented data
         csv = customer_data.to_csv(index=True)
-        st.download_button(
-            label="Download Segmented Customer Data",
-            data=csv,
-            file_name="segmented_customer_data.csv",
-            mime="text/csv"
-        )
     else:
-        st.write("Please upload a dataset to start.")
-# Download Dataset Tab
-with tab4:
-    st.write("""
-    You can download the sample 'Online Retail' dataset to get started with customer segmentation tasks.
-    Click the button below to download the dataset in CSV format.
-    """)
-    # Direct Google Drive link to the 'Online Retail' dataset (for direct download)
-    dataset_url_online_retail = "https://drive.google.com/uc?id=1djBqO2sdHfy9DGZQXZu2Er8LUUXtp9Kr&export=download"
-    # Direct Google Drive link to the new dataset (for direct download)
-    dataset_url_new_file = "https://drive.google.com/uc?id=1PbGJSdcyDInsu-9Ua4iHzQh-YpVk_RqT&export=download"
-    # Download the file from the URLs
-    response_online_retail = requests.get(dataset_url_online_retail)
-    file_data_online_retail = response_online_retail.text  # Get the content as text
-    response_new_file = requests.get(dataset_url_new_file)
-    file_data_new_file = response_new_file.text  # Get the content as text
-    # Convert the CSV data into a CSV download for Streamlit
-    st.download_button(
-        label="Download Online Retail Dataset",
-        data=file_data_online_retail,
-        file_name="Online_Retail.csv",
-        mime="text/csv"
-    )
-    st.download_button(
-        label="Download New Dataset",
-        data=file_data_new_file,
-        file_name="New_Dataset.csv",
-        mime="text/csv"
-    )

 from sklearn.preprocessing import StandardScaler
 import matplotlib.pyplot as plt
 import seaborn as sns
 # App title
 st.title("🛍️ Customer Segmentation Tool")
+# Streamlit Tabs
 tab1, tab2, tab3, tab4 = st.tabs(["📖 About", "📊 Dataset Overview", "🧑‍🤝‍🧑 Customer Segmentation", "📥 Download Dataset"])
 # About Tab
 with tab1:
     st.write("""
+    This app segments customers based on their purchasing behavior using unsupervised learning.
+    You can upload one or two datasets for analysis.
     """)
 # File uploader in the Dataset Tab
 with tab2:
+    uploaded_file1 = st.file_uploader("Upload First Dataset", type=["csv", "xlsx"], key="file1")
+    uploaded_file2 = st.file_uploader("Upload Second Dataset (Optional)", type=["csv", "xlsx"], key="file2")
def load_data(uploaded_file):
    """Read an uploaded CSV or Excel file into a DataFrame.

    Args:
        uploaded_file: File-like object exposing a ``name`` attribute (the
            value returned by ``st.file_uploader``), or ``None`` when
            nothing was uploaded.

    Returns:
        The parsed ``pandas.DataFrame``, or ``None`` when no file was given,
        the extension is not supported, or parsing failed. In the last two
        cases an error message is shown through ``st.error``.
    """
    if uploaded_file is None:
        return None

    # Match the extension case-insensitively so "DATA.CSV" / "report.XLSX"
    # are accepted instead of falling through every branch.
    file_name = uploaded_file.name.lower()
    try:
        if file_name.endswith(".csv"):
            # on_bad_lines="skip" drops malformed rows rather than aborting.
            return pd.read_csv(uploaded_file, encoding="ISO-8859-1", on_bad_lines="skip")
        if file_name.endswith(".xlsx"):
            return pd.read_excel(uploaded_file)
    except Exception as e:  # UI boundary: surface any parser failure to the user
        st.error(f"Error loading dataset: {e}")
        return None

    # Reaching here means the extension matched no reader. The earlier code
    # hit an unbound `df` on this path and reported a misleading error.
    st.error("Unsupported file format. Please upload a CSV or Excel file.")
    return None
+    df1 = load_data(uploaded_file1)
+    df2 = load_data(uploaded_file2)
+    if df1 is not None:
+        st.write("### First Dataset Overview")
+        st.write(df1.head())
+    if df2 is not None:
+        st.write("### Second Dataset Overview")
+        st.write(df2.head())
+    if df1 is not None and df2 is not None:
+        merge_option = st.radio("How would you like to combine the datasets?", ("Concatenate", "Keep Separate"))
+        if merge_option == "Concatenate":
+            df = pd.concat([df1, df2], ignore_index=True)
+        else:
+            df = None  # Handle separately in clustering
+    else:
+        df = df1 if df1 is not None else df2
+# Customer Segmentation Tab
+with tab3:
+    if df is not None:
+        # Column selection
+        st.write("### Select Columns")
+        customer_col = st.selectbox("Select Customer Column", df.columns.tolist(), index=0)
+        quantity_col = st.selectbox("Select Quantity Column", df.columns.tolist(), index=0)
+        unit_price_col = st.selectbox("Select Unit Price Column", df.columns.tolist(), index=0)
+        df = df.dropna(subset=[customer_col])
+        df["TotalSpent"] = pd.to_numeric(df[quantity_col], errors='coerce') * pd.to_numeric(df[unit_price_col], errors='coerce')
         df = df.dropna(subset=["TotalSpent"])
         customer_data = df.groupby(customer_col).agg({
             "TotalSpent": "sum",
             quantity_col: "sum",
             unit_price_col: "mean"
         }).rename(columns={quantity_col: "NumTransactions", unit_price_col: "AvgUnitPrice"})
         scaler = StandardScaler()
         customer_scaled = pd.DataFrame(scaler.fit_transform(customer_data), columns=customer_data.columns, index=customer_data.index)
         num_clusters = st.slider("Select Number of Clusters", min_value=2, max_value=10, value=3)
         model = KMeans(n_clusters=num_clusters, random_state=42)
         customer_data["Cluster"] = model.fit_predict(customer_scaled)
         st.write("### Clusters Visualization")
         fig, ax = plt.subplots()
         scatter = ax.scatter(customer_data["TotalSpent"], customer_data["NumTransactions"], c=customer_data["Cluster"], cmap='viridis')
         plt.colorbar(scatter, label="Cluster")
         st.pyplot(fig)
         csv = customer_data.to_csv(index=True)
+        st.download_button("Download Segmented Customer Data", data=csv, file_name="segmented_customer_data.csv", mime="text/csv")
     else:
+        st.write("Please upload at least one dataset to start.")