mherlie committed on
Commit
dd9dd07
·
1 Parent(s): ab29971

modified code and added dataset

Browse files
Files changed (4) hide show
  1. .gitignore +2 -0
  2. Test.csv +0 -0
  3. app.py +87 -0
  4. requirements.txt +6 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ .env
2
+ venv/
Test.csv ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns
6
+ from sklearn.cluster import KMeans
7
+ from sklearn.preprocessing import StandardScaler, OneHotEncoder
8
+ from sklearn.decomposition import PCA
9
+
10
# Customer Segmentation App.
#
# Flow of the script (re-executed by Streamlit on every widget interaction):
#   1. Upload a CSV and preview it.
#   2. Drop rows with missing values and let the user pick feature columns.
#   3. One-hot encode object columns, standardize everything.
#   4. Plot the K-Means elbow curve, let the user choose k, cluster the rows.
#   5. Project the standardized features to 2-D with PCA and plot the clusters.
st.title("Customer Segmentation App")

# File uploader
dataset_file = st.file_uploader("Upload your CSV file", type=["csv"])

if dataset_file is not None:
    df = pd.read_csv(dataset_file)
    st.write("### Preview of Uploaded Data:")
    st.dataframe(df.head())

    # Drop rows with missing values in the entire dataset
    df = df.dropna()

    # Select features for clustering
    st.write("### Select Features for Clustering")
    selected_features = st.multiselect(
        "Choose features", df.columns.tolist(), default=df.columns.tolist()
    )

    n_samples = len(df)
    # K-Means cannot fit more clusters than there are rows, so cap the range.
    max_k = min(10, n_samples)

    if max_k < 2:
        # Guard: the encoder, scaler and K-Means all raise on too few rows.
        st.error("At least 2 rows without missing values are required for clustering.")
    elif selected_features:
        data = df[selected_features]

        # Identify categorical and numerical features
        categorical_cols = data.select_dtypes(include=['object']).columns.tolist()
        numerical_cols = data.select_dtypes(include=['number']).columns.tolist()

        # Encode categorical features
        if categorical_cols:
            encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
            encoded_cats = encoder.fit_transform(data[categorical_cols])
            encoded_cats_df = pd.DataFrame(
                encoded_cats,
                columns=encoder.get_feature_names_out(categorical_cols),
            )
            # reset_index aligns the numeric part with the freshly built
            # (0..n-1 indexed) encoded frame; dropna() left gaps in df's index.
            data = pd.concat(
                [data[numerical_cols].reset_index(drop=True), encoded_cats_df],
                axis=1,
            )

        n_features = data.shape[1]

        if n_features == 0:
            # e.g. only single-valued categorical columns were selected:
            # drop='first' removes their only category and nothing is left.
            st.error("The selected features produce no usable columns. Please select different features.")
        else:
            # Standardize data
            scaled_data = StandardScaler().fit_transform(data)

            # Ensure no NaN values exist after transformations
            if np.isnan(scaled_data).any():
                st.error("Data contains NaN values even after preprocessing. Please check your dataset.")
            else:
                # Determine number of clusters using Elbow Method
                st.write("### Elbow Method for Optimal K")
                k_values = range(1, max_k + 1)
                distortions = [
                    KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
                    .fit(scaled_data)
                    .inertia_
                    for n_clusters in k_values
                ]

                fig, ax = plt.subplots()
                ax.plot(k_values, distortions, marker='o')
                ax.set_xlabel('Number of Clusters')
                ax.set_ylabel('Distortion')
                ax.set_title('Elbow Method')
                st.pyplot(fig)
                # Release the figure; otherwise figures pile up across reruns.
                plt.close(fig)

                # Choose number of clusters
                if max_k > 2:
                    k = st.slider(
                        "Select Number of Clusters",
                        min_value=2,
                        max_value=max_k,
                        value=3,
                    )
                else:
                    # Exactly 2 rows: a slider would have min == max, so fix k.
                    k = 2
                    st.info("Only 2 rows are available, so 2 clusters are used.")

                # Apply K-Means
                kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
                df['Cluster'] = kmeans.fit_predict(scaled_data)

                st.write("### Clustered Data")
                st.dataframe(df.head())

                if n_features < 2:
                    # PCA(n_components=2) requires at least 2 feature columns.
                    st.warning(
                        "PCA visualization needs at least 2 feature columns after encoding; "
                        "select more features to see the cluster plot."
                    )
                else:
                    # PCA for visualization
                    pca = PCA(n_components=2)
                    pca_result = pca.fit_transform(scaled_data)
                    df['PCA1'] = pca_result[:, 0]
                    df['PCA2'] = pca_result[:, 1]

                    # Scatter plot of clusters
                    st.write("### Cluster Visualization (PCA)")
                    fig, ax = plt.subplots()
                    sns.scatterplot(x='PCA1', y='PCA2', hue='Cluster', palette='viridis', data=df, ax=ax)
                    ax.set_title("Customer Segmentation (PCA Visualization)")
                    st.pyplot(fig)
                    plt.close(fig)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ streamlit
2
+ pandas
3
+ numpy
4
+ matplotlib
5
+ seaborn
6
+ scikit-learn