import streamlit as st
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, adjusted_rand_score
import matplotlib.pyplot as plt
import seaborn as sns
import io


@st.cache_data  # Cache so the CSV is parsed only once per session
def load_data():
    """Load the marketing campaign dataset from a tab-separated CSV."""
    with st.spinner("Loading data..."):
        df = pd.read_csv("marketing_campaign.csv", delimiter='\t')
    return df


def handle_mixed_types(df):
    """Coerce columns whose cells hold more than one Python type.

    Mixed int/float columns become float; any other mixture (e.g. numbers
    plus strings) is coerced to string so each column has a single dtype.
    Mutates and returns *df*.
    """
    for col in df.columns:
        unique_types = df[col].apply(type).unique()
        if len(unique_types) > 1:  # column holds mixed types
            if all(issubclass(t, (int, float)) for t in unique_types):
                df[col] = df[col].astype(float)
            else:
                df[col] = df[col].astype(str)
    return df


def handle_nulls(df):
    """Impute missing values: mode for categoricals, mean for numericals.

    Mutates and returns *df*.
    """
    for col in df.columns:
        if df[col].dtype == 'object':
            # BUG FIX: Series.mode() returns a Series; passing it to fillna
            # aligns on index and fills (at most) the first row instead of
            # every null. Use the first modal value as a scalar.
            modes = df[col].mode()
            if not modes.empty:
                df[col] = df[col].fillna(modes.iloc[0])
        else:
            df[col] = df[col].fillna(df[col].mean())
    return df


def check_data_types(df):
    """Parse the enrolment date column into proper datetimes (day-first)."""
    df['Dt_Customer'] = pd.to_datetime(df['Dt_Customer'], dayfirst=True)
    return df


def _render_current_figure():
    """Push the current matplotlib figure into the Streamlit page.

    Closes the figure afterwards — repeated reruns would otherwise leak
    figures and grow memory without bound.
    """
    img = io.BytesIO()
    plt.savefig(img, format='png')
    img.seek(0)
    st.image(img)
    plt.close()


def visualize_data(df):
    """Plot histograms of the three numeric columns with highest variance."""
    st.subheader("Data Visualization")
    # Only numeric columns are candidates (dates and strings excluded),
    # so every selected column gets a histogram + KDE.
    numerical_df = df.select_dtypes(exclude=['object', 'datetime'])
    top_3_cols = numerical_df.var().sort_values(ascending=False).head(3).index.tolist()
    for col in top_3_cols:
        plt.figure(figsize=(10, 5))
        sns.histplot(x=col, data=df, kde=True)
        _render_current_figure()


def preprocess_data_with_pca(df):
    """One-hot encode, engineer date features, scale, and reduce with PCA.

    Returns:
        (X_pca, y_true): the PCA-transformed feature matrix retaining 95%
        of variance, and the untouched 'Response' column as ground truth.
    """
    st.subheader("Preprocessed Data with PCA")

    # One-hot encode the remaining categorical (object) features.
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
    df_encoded = pd.get_dummies(df, columns=categorical_cols)

    # 'Response' is the label — exclude it from the clustering features.
    X = df_encoded.drop(columns=['Response'])

    # Replace the raw datetime with numeric year/month components.
    X['Dt_Customer_Year'] = X['Dt_Customer'].dt.year
    X['Dt_Customer_Month'] = X['Dt_Customer'].dt.month
    X = X.drop(columns=['Dt_Customer'])

    # Scale numeric features to [0, 1] so no column dominates the PCA.
    scaler = MinMaxScaler()
    numerical_cols = X.select_dtypes(include=['number']).columns.tolist()
    X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

    pca = PCA(n_components=0.95)  # retain 95% of the variance
    X_pca = pca.fit_transform(X)

    st.write(pd.DataFrame(X_pca).head())
    return X_pca, df['Response']


def _rand_index_or_na(y_true, y_pred):
    """Adjusted Rand index, or an explanatory string when undefined."""
    if len(set(y_pred)) > 1:
        return adjusted_rand_score(y_true, y_pred)
    return "N/A (Only one cluster found)"


def run_kmeans(X, y_true, n_clusters=5):
    """Cluster X with K-Means; return (n_clusters, silhouette, rand_index)."""
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    y_pred = kmeans.fit_predict(X)
    silhouette = silhouette_score(X, y_pred)
    return kmeans.n_clusters, silhouette, _rand_index_or_na(y_true, y_pred)


def run_hierarchical(X, y_true, n_clusters=5):
    """Agglomerative clustering; return (n_clusters, silhouette, rand_index)."""
    hierarchical = AgglomerativeClustering(n_clusters=n_clusters)
    y_pred = hierarchical.fit_predict(X)
    silhouette = silhouette_score(X, y_pred)
    return hierarchical.n_clusters, silhouette, _rand_index_or_na(y_true, y_pred)


def run_dbscan(X, y_true, eps=0.5, min_samples=5):
    """DBSCAN clustering; return (n_clusters, silhouette, rand_index).

    Noise points (label -1) are excluded from the cluster count; both
    metrics degrade to explanatory strings when fewer than two clusters
    are found.
    """
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    y_pred = dbscan.fit_predict(X)
    n_clusters = len(set(y_pred)) - (1 if -1 in y_pred else 0)  # exclude noise
    if n_clusters > 1:
        silhouette = silhouette_score(X, y_pred)
        rand_index = adjusted_rand_score(y_true, y_pred)
    else:
        silhouette = "N/A (Only one cluster found)"
        rand_index = "N/A (Only one cluster found)"
    return n_clusters, silhouette, rand_index


def run_gaussian_mixture(X, y_true, n_components=5):
    """Gaussian mixture clustering; return (n_clusters, silhouette, rand_index)."""
    gaussian_mixture = GaussianMixture(n_components=n_components, random_state=42)
    y_pred = gaussian_mixture.fit_predict(X)
    silhouette = silhouette_score(X, y_pred)
    return gaussian_mixture.n_components, silhouette, _rand_index_or_na(y_true, y_pred)


def main():
    """Streamlit entry point: load, clean, visualize, cluster, and report."""
    st.title("Customer Segmentation App")

    df = load_data()

    # Cleaning pipeline: unify cell types, impute nulls, parse dates.
    # (The original ran handle_mixed_types a second time after the date
    # parse; that pass is a no-op because no step reintroduces mixed types.)
    df = handle_mixed_types(df)
    df = handle_nulls(df)
    df = check_data_types(df)

    visualize_data(df)

    X_pca, y_true = preprocess_data_with_pca(df)

    tab1, tab2, tab3, tab4 = st.tabs(
        ["K-Means", "Hierarchical", "DBSCAN", "Gaussian Mixture"])

    with tab1:
        n_clusters, silhouette, rand_index = run_kmeans(X_pca, y_true)
        st.write(f"Number of Clusters: {n_clusters}")
        st.write(f"Silhouette Score: {silhouette:.3f}")
        st.write(f"Rand Index: {rand_index}")

    with tab2:
        n_clusters, silhouette, rand_index = run_hierarchical(X_pca, y_true)
        st.write(f"Number of Clusters: {n_clusters}")
        st.write(f"Silhouette Score: {silhouette:.3f}")
        st.write(f"Rand Index: {rand_index}")

    with tab3:
        n_clusters, silhouette, rand_index = run_dbscan(X_pca, y_true)
        st.write(f"Number of Clusters: {n_clusters}")
        # DBSCAN's silhouette may be an "N/A" string — no float format here.
        st.write(f"Silhouette Score: {silhouette}")
        st.write(f"Rand Index: {rand_index}")

    with tab4:
        n_clusters, silhouette, rand_index = run_gaussian_mixture(X_pca, y_true)
        st.write(f"Number of Clusters: {n_clusters}")
        st.write(f"Silhouette Score: {silhouette:.3f}")
        st.write(f"Rand Index: {rand_index}")


if __name__ == "__main__":
    main()