import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA

# App title
st.title("Customer Segmentation App")

# File uploader
dataset_file = st.file_uploader("Upload your CSV file", type=["csv"])

if dataset_file is not None:
    df = pd.read_csv(dataset_file)
    st.write("### Preview of Uploaded Data:")
    st.dataframe(df.head())
    
    # Drop any rows containing missing values (note: this applies to every
    # column, including ones not later selected for clustering)
    df.dropna(inplace=True)
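    # --- Added sketch (not in the original app): surface how many rows survive
    # the dropna() above, so silent data loss is visible. st.caption is a
    # standard Streamlit call; the wording is an arbitrary choice.
    st.caption(f"{len(df)} rows remain after dropping rows with missing values.")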
    
    # Select features for clustering
    st.write("### Select Features for Clustering")
    selected_features = st.multiselect("Choose features", df.columns.tolist(), default=df.columns.tolist())
    
    if selected_features:
        data = df[selected_features]
        
        # Identify categorical and numerical features
        categorical_cols = data.select_dtypes(include=['object']).columns.tolist()
        numerical_cols = data.select_dtypes(include=['number']).columns.tolist()
        
        # Encode categorical features
        if categorical_cols:
            encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
            encoded_cats = encoder.fit_transform(data[categorical_cols])
            encoded_cats_df = pd.DataFrame(encoded_cats, columns=encoder.get_feature_names_out(categorical_cols))
            data = pd.concat([data[numerical_cols].reset_index(drop=True), encoded_cats_df], axis=1)
        
        # Standardize data
        scaler = StandardScaler()
        scaled_data = scaler.fit_transform(data)
        
        # Ensure no NaN values exist after transformations
        if np.isnan(scaled_data).any():
            st.error("Data contains NaN values even after preprocessing. Please check your dataset.")
        else:
            # Determine number of clusters using Elbow Method
            st.write("### Elbow Method for Optimal K")
            distortions = []
            K_range = range(1, 11)
            for k in K_range:
                kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
                kmeans.fit(scaled_data)
                distortions.append(kmeans.inertia_)
            
            fig, ax = plt.subplots()
            ax.plot(K_range, distortions, marker='o')
            ax.set_xlabel('Number of Clusters')
            ax.set_ylabel('Distortion')
            ax.set_title('Elbow Method')
            st.pyplot(fig)
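
            # --- Added sketch (not in the original app): silhouette analysis ---
            # The elbow plot can be ambiguous, so this offers a second, rough
            # opinion on k. sklearn's silhouette_score measures how well
            # separated the clusters are (higher is better). It refits KMeans
            # for k = 2..10 on the same scaled data, so treat it as a sanity
            # check rather than a definitive model-selection step.
            from sklearn.metrics import silhouette_score
            sil_scores = []
            for k_try in range(2, 11):
                labels = KMeans(n_clusters=k_try, random_state=42, n_init=10).fit_predict(scaled_data)
                sil_scores.append(silhouette_score(scaled_data, labels))
            fig_sil, ax_sil = plt.subplots()
            ax_sil.plot(range(2, 11), sil_scores, marker='o')
            ax_sil.set_xlabel('Number of Clusters')
            ax_sil.set_ylabel('Silhouette Score')
            ax_sil.set_title('Silhouette Analysis')
            st.pyplot(fig_sil)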
            
            # Choose number of clusters
            k = st.slider("Select Number of Clusters", min_value=2, max_value=10, value=3)
            
            # Apply K-Means
            kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
            df['Cluster'] = kmeans.fit_predict(scaled_data)
            
            st.write("### Clustered Data")
            st.dataframe(df.head())
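
            # --- Added sketch (not in the original app): per-cluster profile ---
            # A common way to interpret segments: the mean of each numeric
            # feature per cluster, using the numerical_cols list computed above.
            if numerical_cols:
                st.write("### Cluster Profiles (mean of numeric features)")
                st.dataframe(df.groupby('Cluster')[numerical_cols].mean())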
            
            # PCA projection for visualization; PCA(n_components=2) needs at
            # least two feature columns, so guard against single-column input
            if scaled_data.shape[1] >= 2:
                pca = PCA(n_components=2)
                pca_result = pca.fit_transform(scaled_data)
                df['PCA1'] = pca_result[:, 0]
                df['PCA2'] = pca_result[:, 1]

                # Scatter plot of clusters in the 2-D PCA space
                st.write("### Cluster Visualization (PCA)")
                fig, ax = plt.subplots()
                sns.scatterplot(x='PCA1', y='PCA2', hue='Cluster', palette='viridis', data=df, ax=ax)
                ax.set_title("Customer Segmentation (PCA Visualization)")
                st.pyplot(fig)
            else:
                st.info("Select at least two feature columns to enable the PCA cluster visualization.")
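
            # --- Added sketch (not in the original app): export results ---
            # Lets the user download the labeled dataframe. st.download_button
            # and DataFrame.to_csv are standard Streamlit / pandas calls; the
            # file name is an arbitrary choice.
            csv_bytes = df.to_csv(index=False).encode('utf-8')
            st.download_button("Download Clustered Data (CSV)", data=csv_bytes,
                               file_name="clustered_customers.csv", mime="text/csv")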