chiichann committed on
Commit
0b333c7
·
verified ·
1 Parent(s): 53c2b9d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -83
app.py CHANGED
@@ -5,100 +5,72 @@ from sklearn.cluster import KMeans
5
  from sklearn.preprocessing import StandardScaler
6
  import matplotlib.pyplot as plt
7
  import seaborn as sns
8
- import requests
9
 
10
  # App title
11
  st.title("🛍️ Customer Segmentation Tool")
12
 
13
- tab1, tab2, tab3, tab4 = st.tabs(["📖 About", "📊 Dataset Overview", "🧑‍🤝‍🧑 Customer Segmentation", "📥 Download Dataset"])
 
14
 
15
  # About Tab
16
  with tab1:
17
  st.write("""
18
- This app uses unsupervised learning techniques to segment customers based on their purchasing behavior.
 
 
 
 
 
 
19
  """)
20
 
21
- # File uploader in the Dataset Tab
22
- with tab2:
23
- uploaded_file = st.file_uploader("Upload Your Dataset", type=["csv", "xlsx"])
24
-
25
- if uploaded_file is not None:
26
- try:
27
- if uploaded_file.name.endswith('.csv'):
28
- df = pd.read_csv(uploaded_file, encoding='ISO-8859-1', on_bad_lines='skip')
29
- elif uploaded_file.name.endswith('.xlsx'):
30
- df = pd.read_excel(uploaded_file)
31
- else:
32
- st.error("Unsupported file format. Please upload a CSV or Excel file.")
33
- st.stop()
34
-
35
- st.write("### Dataset Overview")
36
- st.write(df.head())
37
- except Exception as e:
38
- st.error(f"Error loading dataset: {e}")
39
- st.stop()
40
-
41
- st.write("### Columns detected in your dataset:")
42
- st.write(df.columns.tolist())
43
-
44
- customer_col = st.selectbox("Select Customer Column", df.columns.tolist())
45
- quantity_col = st.selectbox("Select Quantity Column", df.columns.tolist())
46
- unit_price_col = st.selectbox("Select Unit Price Column", df.columns.tolist())
47
-
48
- if customer_col not in df.columns or quantity_col not in df.columns or unit_price_col not in df.columns:
49
- st.error("One or more selected columns do not exist in the dataset. Please select valid columns.")
50
- st.stop()
51
-
52
- df = df.dropna(subset=[customer_col])
53
- df["TotalSpent"] = pd.to_numeric(df[quantity_col], errors='coerce') * pd.to_numeric(df[unit_price_col], errors='coerce')
54
- df = df.dropna(subset=["TotalSpent"])
55
 
56
- customer_data = df.groupby(customer_col).agg({
57
- "TotalSpent": "sum",
58
- quantity_col: "sum",
59
- unit_price_col: "mean"
60
- })
61
-
62
- # Debugging: Check column names before renaming
63
- st.write("### Columns before renaming:", customer_data.columns.tolist())
64
-
65
- customer_data = customer_data.rename(columns={quantity_col: "NumTransactions", unit_price_col: "AvgUnitPrice"})
66
-
67
- # Debugging: Check column names after renaming
68
- st.write("### Columns after renaming:", customer_data.columns.tolist())
69
-
70
- if "NumTransactions" not in customer_data.columns:
71
- st.error("Error: 'NumTransactions' column is missing after processing. Please check column mapping.")
72
- st.stop()
73
-
74
- scaler = StandardScaler()
75
- customer_scaled = pd.DataFrame(scaler.fit_transform(customer_data), columns=customer_data.columns, index=customer_data.index)
 
 
 
76
 
77
  # Customer Segmentation Tab
78
  with tab3:
79
- if uploaded_file is not None:
80
- num_clusters = st.slider("Select Number of Clusters", min_value=2, max_value=10, value=3)
81
- model = KMeans(n_clusters=num_clusters, random_state=42)
82
- customer_data["Cluster"] = model.fit_predict(customer_scaled)
83
-
84
- st.write("### Clusters Visualization")
85
- fig, ax = plt.subplots()
86
- scatter = ax.scatter(customer_data["TotalSpent"], customer_data["NumTransactions"], c=customer_data["Cluster"], cmap='viridis')
87
- ax.set_xlabel("Total Spent")
88
- ax.set_ylabel("Number of Transactions")
89
- ax.set_title("Customer Segments")
90
- plt.colorbar(scatter, label="Cluster")
91
- st.pyplot(fig)
92
-
93
- st.write("### Customer Segments Data")
94
- st.write(customer_data.head())
95
-
96
- csv = customer_data.to_csv(index=True)
97
- st.download_button(
98
- label="Download Segmented Customer Data",
99
- data=csv,
100
- file_name="segmented_customer_data.csv",
101
- mime="text/csv"
102
- )
103
- else:
104
- st.write("Please upload a dataset to start.")
 
5
  from sklearn.preprocessing import StandardScaler
6
  import matplotlib.pyplot as plt
7
  import seaborn as sns
 
8
 
# App title
st.title("🛍️ Customer Segmentation Tool")

# Location and layout of the preloaded workbook.
DATA_PATH = "/mnt/data/Online Retail.xlsx"
DATA_SHEET = "Online Retail"

# Columns the preprocessing step reads from the raw dataset.
REQUIRED_COLUMNS = ("CustomerID", "Quantity", "UnitPrice")

# Markdown shown in the About tab. Lines are kept flush-left so that none of
# them is rendered as an indented code block.
ABOUT_TEXT = "\n".join([
    "This app uses unsupervised learning techniques to segment customers based on their purchasing behavior.",
    "The dataset is preloaded and contains online retail data.",
    "### How It Works:",
    "- **Step 1**: Load customer transaction data, including details like Quantity, UnitPrice, and CustomerID.",
    "- **Step 2**: Process the data by calculating the total spent and aggregating the information by customer.",
    "- **Step 3**: Apply **K-Means Clustering** to segment the customers into distinct groups.",
    "- **Step 4**: Visualize the customer segments with a scatter plot.",
])


@st.cache_data
def _load_dataset(path: str, sheet_name: str) -> pd.DataFrame:
    """Read one sheet of the Excel workbook at ``path``.

    Streamlit re-executes the whole script on every widget interaction, so
    the read is cached to avoid re-parsing the workbook each time the
    cluster slider moves.
    """
    return pd.read_excel(path, sheet_name=sheet_name)


def _aggregate_by_customer(transactions: pd.DataFrame) -> pd.DataFrame:
    """Collapse transaction rows into one row per ``CustomerID``.

    Rows without a ``CustomerID`` are dropped, as are rows whose
    ``Quantity`` or ``UnitPrice`` cannot be parsed as a number. The result
    has the columns ``TotalSpent`` (sum of Quantity * UnitPrice),
    ``NumTransactions`` (sum of Quantity) and ``AvgUnitPrice`` (mean of
    UnitPrice), indexed by ``CustomerID``.
    """
    rows = transactions.dropna(subset=["CustomerID"]).copy()
    # Coerce once and keep the numeric columns, so the aggregation below works
    # on numbers even when the raw columns hold strings.
    rows["Quantity"] = pd.to_numeric(rows["Quantity"], errors="coerce")
    rows["UnitPrice"] = pd.to_numeric(rows["UnitPrice"], errors="coerce")
    rows["TotalSpent"] = rows["Quantity"] * rows["UnitPrice"]
    rows = rows.dropna(subset=["TotalSpent"])

    # NOTE(review): "NumTransactions" is the summed Quantity, not a count of
    # distinct transactions; the name is kept so existing output is unchanged.
    return (
        rows.groupby("CustomerID")
        .agg({"TotalSpent": "sum", "Quantity": "sum", "UnitPrice": "mean"})
        .rename(columns={"Quantity": "NumTransactions", "UnitPrice": "AvgUnitPrice"})
    )


# 🎯 Streamlit Tabs
tab1, tab2, tab3 = st.tabs(["📖 About", "📊 Dataset Overview", "🧑‍🤝‍🧑 Customer Segmentation"])

# About Tab
with tab1:
    st.write(ABOUT_TEXT)

# Load preloaded dataset
try:
    df = _load_dataset(DATA_PATH, DATA_SHEET)
except (OSError, ValueError) as exc:
    # Missing file or missing worksheet: show a readable message instead of a
    # raw traceback and halt the script.
    st.error(f"Could not load dataset from '{DATA_PATH}': {exc}")
    st.stop()

missing_columns = [name for name in REQUIRED_COLUMNS if name not in df.columns]
if missing_columns:
    st.error(f"Dataset is missing required column(s): {', '.join(missing_columns)}")
    st.stop()

# Dataset Overview Tab
with tab2:
    st.write("### Dataset Overview")
    st.write(df.head())

    # Preprocess and aggregate data by customer
    customer_data = _aggregate_by_customer(df)

    st.write("### Processed Customer Data")
    st.write(customer_data.head())

    if customer_data.empty:
        st.error("No usable customer rows remain after preprocessing.")
        st.stop()

    # Standardize the data
    scaler = StandardScaler()
    customer_scaled = pd.DataFrame(
        scaler.fit_transform(customer_data),
        columns=customer_data.columns,
        index=customer_data.index,
    )

# Customer Segmentation Tab
with tab3:
    # User selects the number of clusters
    num_clusters = st.slider("Select Number of Clusters", min_value=2, max_value=10, value=3)

    if len(customer_data) < num_clusters:
        # KMeans cannot form more clusters than there are samples.
        st.warning(
            f"Only {len(customer_data)} customer(s) available; "
            "select a smaller number of clusters."
        )
        st.stop()

    # Apply K-Means clustering. n_init is pinned so results do not depend on
    # the installed scikit-learn version's default.
    model = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
    customer_data = customer_data.assign(Cluster=model.fit_predict(customer_scaled))

    # Visualize the clusters
    st.write("### Clusters Visualization")
    fig, ax = plt.subplots()
    scatter = ax.scatter(
        customer_data["TotalSpent"],
        customer_data["NumTransactions"],
        c=customer_data["Cluster"],
        cmap="viridis",
    )
    ax.set_xlabel("Total Spent")
    ax.set_ylabel("Number of Transactions")
    ax.set_title("Customer Segments")
    # Attach the colorbar to this figure explicitly rather than through
    # pyplot's global "current figure" state.
    fig.colorbar(scatter, ax=ax, label="Cluster")
    st.pyplot(fig)
    # Release the figure; otherwise pyplot keeps one per script rerun.
    plt.close(fig)

    # Show the segmented customer data
    st.write("### Customer Segments Data")
    st.write(customer_data.head())