chiichann committed on
Commit
6ba4689
·
verified ·
1 Parent(s): f153315

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +124 -91
app.py CHANGED
@@ -1,91 +1,124 @@
"""Previous revision of app.py: K-Means customer segmentation on a bundled workbook.

The script reads the "Online Retail" sheet of a local Excel file, aggregates
the transactions per customer, standardizes the aggregates, clusters them with
K-Means and renders the outcome in three Streamlit tabs.
"""
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Page heading
st.title("🛍️ Customer Segmentation Tool")

# One tab per stage of the workflow
about_tab, dataset_tab, segmentation_tab = st.tabs(
    ["📖 About", "📊 Dataset Overview", "🧑‍🤝‍🧑 Customer Segmentation"]
)

# --- About ---------------------------------------------------------------
with about_tab:
    st.write("""
    This app uses unsupervised learning techniques to segment customers based on their purchasing behavior.
    The dataset is preloaded from an Excel file containing online retail data.

    ### How It Works:
    - **Step 1**: Load customer transaction data, including details like `Quantity`, `UnitPrice`, and `CustomerID`.
    - **Step 2**: Process the data by calculating the total spent and aggregating the information by customer.
    - **Step 3**: Apply **K-Means Clustering** to segment the customers into distinct groups.
    - **Step 4**: Visualize the customer segments with a scatter plot, and optionally download the segmented data.
    """)

# Workbook shipped next to the script
file_path = "Online Retail.xlsx"

# --- Dataset overview ----------------------------------------------------
with dataset_tab:
    try:
        df = pd.read_excel(file_path, sheet_name="Online Retail")
        st.write("### Dataset Overview")
        st.write(df.head())
    except Exception as e:
        # Without the data nothing below can run, so report and halt the script.
        st.error(f"Error loading dataset: {e}")
        st.stop()

# The pipeline below needs these three columns; halt if any is absent.
required_columns = ("CustomerID", "Quantity", "UnitPrice")
if any(name not in df.columns for name in required_columns):
    st.error("The dataset is missing required columns: 'CustomerID', 'Quantity', 'UnitPrice'. Please check the data.")
    st.stop()

# Drop rows that cannot be attributed to a customer, then price each row.
df = df.dropna(subset=["CustomerID"]).assign(
    TotalSpent=lambda frame: frame["Quantity"] * frame["UnitPrice"]
)

# One row per customer: money spent, distinct invoices, units bought.
customer_data = df.groupby("CustomerID").agg(
    TotalSpent=("TotalSpent", "sum"),
    NumTransactions=("InvoiceNo", "nunique"),
    Quantity=("Quantity", "sum"),
)

# Standardize the per-customer aggregates before clustering.
scaled_values = StandardScaler().fit_transform(customer_data)
customer_scaled = pd.DataFrame(
    scaled_values,
    columns=customer_data.columns,
    index=customer_data.index,
)

# --- Segmentation ----------------------------------------------------------
with segmentation_tab:
    # Cluster count is chosen interactively.
    num_clusters = st.slider("Select Number of Clusters", min_value=2, max_value=10, value=3)

    # Fit K-Means on the standardized features and label every customer.
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    customer_data["Cluster"] = kmeans.fit_predict(customer_scaled)

    # Scatter plot of the raw (unscaled) aggregates, coloured by cluster.
    st.write("### Clusters Visualization")
    fig, ax = plt.subplots()
    points = ax.scatter(
        customer_data["TotalSpent"],
        customer_data["NumTransactions"],
        c=customer_data["Cluster"],
        cmap='viridis',
    )
    ax.set_xlabel("Total Spent")
    ax.set_ylabel("Number of Transactions")
    ax.set_title("Customer Segments")
    plt.colorbar(points, label="Cluster")
    st.pyplot(fig)

    # Preview of the labelled table
    st.write("### Customer Segments Data")
    st.write(customer_data.head())

    # Offer the full labelled table as a CSV download.
    csv_payload = customer_data.to_csv(index=True)
    st.download_button(
        label="Download Segmented Customer Data",
        data=csv_payload,
        file_name="segmented_customer_data.csv",
        mime="text/csv",
    )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Streamlit app that segments the customers of an uploaded retail CSV with K-Means.

Workflow: the user uploads a CSV, maps its customer / quantity / unit-price
columns, the script aggregates one feature row per customer, standardizes the
features, clusters them and offers the labelled table as a download. A fourth
tab offers a sample dataset fetched from ``dataset_url``.
"""
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns


def _default_index(columns, preferred):
    """Return the position of *preferred* in *columns*, or 0 when it is absent."""
    return columns.index(preferred) if preferred in columns else 0


def _build_customer_features(df, customer_col, quantity_col, unit_price_col):
    """Aggregate transaction rows into one numeric feature row per customer.

    Args:
        df: Raw transaction table.
        customer_col: Name of the column identifying the customer.
        quantity_col: Name of the column holding the purchased quantity.
        unit_price_col: Name of the column holding the unit price.

    Returns:
        DataFrame indexed by customer with the columns ``TotalSpent``,
        ``TotalQuantity`` and ``AvgUnitPrice``. Customers for which a feature
        cannot be computed are left out.
    """
    # Work on fixed internal names so the user's column names can never
    # collide with each other or with the derived "TotalSpent" column.
    work = pd.DataFrame({
        "customer": df[customer_col],
        # Non-numeric cells become NaN instead of raising during the multiply.
        "quantity": pd.to_numeric(df[quantity_col], errors="coerce"),
        "unit_price": pd.to_numeric(df[unit_price_col], errors="coerce"),
    }).dropna(subset=["customer"])  # rows without a customer cannot be attributed
    work["TotalSpent"] = work["quantity"] * work["unit_price"]

    features = work.groupby("customer").agg(
        TotalSpent=("TotalSpent", "sum"),
        TotalQuantity=("quantity", "sum"),
        AvgUnitPrice=("unit_price", "mean"),
    )
    features.index.name = customer_col
    # K-Means rejects NaN, e.g. a customer whose unit prices were all unparsable.
    return features.dropna()


# App title
st.title("🛍️ Customer Segmentation Tool")

# 🎯 Streamlit Tabs -- four labels, so four names must be unpacked.
tab1, tab2, tab3, tab4 = st.tabs(
    ["📖 About", "📊 Dataset Overview", "🧑‍🤝‍🧑 Customer Segmentation", "📥 Download Dataset"]
)

# About Tab
with tab1:
    st.write("""
    This app uses unsupervised learning techniques to segment customers based on their purchasing behavior.
    The dataset is uploaded by the user, containing online retail data.

    ### How It Works:
    - **Step 1**: Upload customer transaction data, including details like `Quantity`, `UnitPrice`, and `CustomerID`.
    - **Step 2**: Process the data by calculating the total spent and aggregating the information by customer.
    - **Step 3**: Apply **K-Means Clustering** to segment the customers into distinct groups.
    - **Step 4**: Visualize the customer segments with a scatter plot, and optionally download the segmented data.
    """)

# Stay None until a dataset has been loaded and aggregated successfully, so
# the segmentation tab can tell "nothing to cluster" from "ready".
customer_data = None
customer_scaled = None

# File uploader in the Dataset Tab
with tab2:
    uploaded_file = st.file_uploader("Upload Your Dataset", type=["csv"])

    df = None
    if uploaded_file is not None:
        try:
            # Load the CSV file
            df = pd.read_csv(uploaded_file)
        except Exception as e:
            # UI boundary: report any parse failure, but keep rendering the
            # other tabs instead of stopping the whole script.
            st.error(f"Error loading dataset: {e}")

    if df is not None:
        st.write("### Dataset Overview")
        st.write(df.head())

        # Automatically detect possible columns
        columns = df.columns.tolist()
        st.write("### Columns detected in your dataset:")
        st.write(columns)

        # Allow the user to map columns, pre-selecting the conventional names.
        customer_col = st.selectbox(
            "Select Customer Column", columns, index=_default_index(columns, "CustomerID")
        )
        quantity_col = st.selectbox(
            "Select Quantity Column", columns, index=_default_index(columns, "Quantity")
        )
        unit_price_col = st.selectbox(
            "Select Unit Price Column", columns, index=_default_index(columns, "UnitPrice")
        )

        # A selectbox can only return existing columns; the real failure mode
        # is picking the same column for two roles.
        if len({customer_col, quantity_col, unit_price_col}) < 3:
            st.error("Customer, quantity and unit price must be three different columns.")
        else:
            features = _build_customer_features(df, customer_col, quantity_col, unit_price_col)
            if len(features) < 2:
                st.warning(
                    "At least two customers with numeric quantity and unit price "
                    "values are needed for clustering."
                )
            else:
                customer_data = features
                # Standardize the data
                scaler = StandardScaler()
                customer_scaled = pd.DataFrame(
                    scaler.fit_transform(customer_data),
                    columns=customer_data.columns,
                    index=customer_data.index,
                )

# Customer Segmentation Tab
with tab3:
    if uploaded_file is None:
        st.write("Please upload a dataset to start.")
    elif customer_data is None:
        st.warning("Resolve the issues shown in the Dataset Overview tab to run the segmentation.")
    else:
        # K-Means cannot form more clusters than there are customers.
        max_clusters = min(10, len(customer_data))
        if max_clusters > 2:
            # User selects the number of clusters
            num_clusters = st.slider(
                "Select Number of Clusters",
                min_value=2,
                max_value=max_clusters,
                value=min(3, max_clusters),
            )
        else:
            # Only one valid choice; a slider needs min_value < max_value.
            num_clusters = 2

        # Apply K-Means clustering
        model = KMeans(n_clusters=num_clusters, random_state=42)
        customer_data["Cluster"] = model.fit_predict(customer_scaled)

        # Visualize the clusters
        st.write("### Clusters Visualization")
        fig, ax = plt.subplots()
        scatter = ax.scatter(
            customer_data["TotalSpent"],
            customer_data["TotalQuantity"],
            c=customer_data["Cluster"],
            cmap="viridis",
        )
        ax.set_xlabel("Total Spent")
        ax.set_ylabel("Total Quantity")
        ax.set_title("Customer Segments")
        fig.colorbar(scatter, ax=ax, label="Cluster")
        st.pyplot(fig)
        # The script reruns on every interaction; close the figure so pyplot
        # does not accumulate one open figure per rerun.
        plt.close(fig)

        # Show the segmented customer data
        st.write("### Customer Segments Data")
        st.write(customer_data.head())

        # Option to download the segmented data
        csv = customer_data.to_csv(index=True)
        st.download_button(
            label="Download Segmented Customer Data",
            data=csv,
            file_name="segmented_customer_data.csv",
            mime="text/csv",
        )

# Download Dataset Tab
with tab4:
    st.write("""
    You can download the sample 'Online Retail' dataset to get started with customer segmentation tasks.
    Click the button below to download the dataset in CSV format.
    """)

    # Dataset file path (local or cloud-based)
    # For example, we can provide a URL link to an external dataset or include a local file download.
    dataset_url = "https://path_to_your_dataset/Online_Retail.csv"  # Replace with actual URL if necessary

    # Fetch the sample once per session, and never let a failed fetch (such as
    # the placeholder URL above) crash the rest of the app.
    sample_csv = st.session_state.get("sample_dataset_csv")
    if sample_csv is None:
        try:
            sample_csv = pd.read_csv(dataset_url).to_csv(index=False)
            st.session_state["sample_dataset_csv"] = sample_csv
        except Exception as e:
            st.warning(f"The sample dataset is currently unavailable: {e}")

    # Button to download the sample dataset
    if sample_csv is not None:
        st.download_button(
            label="Download Online Retail Dataset",
            data=sample_csv,
            file_name="Online_Retail.csv",
            mime="text/csv",
        )