# Customer Segmentation Streamlit app (Hugging Face Space, commit 5c5a3b6)
import streamlit as st
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, adjusted_rand_score
import matplotlib.pyplot as plt
import seaborn as sns
import io
# Function to load the dataset with st.spinner
@st.cache_data  # cache so Streamlit reruns skip the disk read
def load_data():
    """Read the tab-separated marketing campaign dataset from disk.

    Returns:
        DataFrame loaded from 'marketing_campaign.csv' (TSV format).
    """
    with st.spinner("Loading data..."):
        frame = pd.read_csv("marketing_campaign.csv", delimiter='\t')
    return frame
def handle_mixed_types(df):
    """Coerce columns whose cells hold more than one Python type to one type.

    Columns mixing only numeric types (int/float) become float; any other mix
    (e.g. numbers and strings) becomes string.

    Bug fixed: types are now detected on non-null cells only, and NaN cells
    are kept as NaN when stringifying. The original `astype(str)` turned NaN
    into the literal string 'nan', which then escaped handle_nulls(), and a
    single float NaN inside a string column forced a spurious conversion.

    Args:
        df: DataFrame to normalize (modified in place).

    Returns:
        The same DataFrame with per-column homogeneous value types.
    """
    for col in df.columns:
        # Ignore missing values: NaN's float type must not count as a "mix".
        unique_types = df[col].dropna().apply(type).unique()
        if len(unique_types) > 1:  # Check if there are mixed types
            if all(issubclass(t, (int, float)) for t in unique_types):
                # Mixed int/float -> float keeps the column numeric.
                df[col] = df[col].astype(float)
            else:
                # Mixed numeric/string -> string, but preserve NaN as NaN so
                # later null imputation still sees it.
                df[col] = df[col].where(df[col].isna(), df[col].astype(str))
    return df
def handle_nulls(df):
    """Impute missing values column by column.

    Categorical (object) columns are filled with the column mode; numerical
    columns are filled with the column mean.

    Bug fixed: ``fillna(df[col].mode())`` passed a *Series*, which fillna
    aligns by index — effectively only filling the row at index 0. We now
    take the scalar mode, and skip all-null columns (which have no mode).

    Args:
        df: DataFrame to impute (modified in place).

    Returns:
        The same DataFrame with nulls imputed.
    """
    for col in df.columns:
        if df[col].dtype == 'object':
            mode = df[col].mode()
            if not mode.empty:  # an all-null column has no mode; leave as-is
                df[col] = df[col].fillna(mode.iloc[0])
        else:
            df[col] = df[col].fillna(df[col].mean())
    return df
# Function to check data type consistency
def check_data_types(df):
    """Normalize column data types for downstream feature engineering.

    Parses the 'Dt_Customer' column (day-first date strings such as
    '04-09-2012') into pandas datetimes so later steps can use .dt accessors.

    Args:
        df: DataFrame containing a 'Dt_Customer' column.

    Returns:
        The same DataFrame with 'Dt_Customer' as datetime64.
    """
    parsed_dates = pd.to_datetime(df['Dt_Customer'], dayfirst=True)
    df['Dt_Customer'] = parsed_dates
    return df
# Function to visualize data distribution
def visualize_data(df):
    """Show distribution plots for the three highest-variance numeric columns.

    Each plot is rendered to an in-memory PNG and displayed with st.image.

    Fixes over the original: the object-dtype countplot branch was
    unreachable (columns are selected from a numeric-only frame), and
    figures were never closed, leaking matplotlib state across Streamlit
    reruns.

    Args:
        df: cleaned DataFrame (object and datetime columns are skipped).
    """
    st.subheader("Data Visualization")
    # Select top 3 columns with highest variance (excluding date and object types)
    numerical_df = df.select_dtypes(exclude=['object', 'datetime'])
    top_3_cols = numerical_df.var().sort_values(ascending=False).head(3).index.tolist()
    for col in top_3_cols:
        # Columns here are guaranteed numeric (selected from numerical_df),
        # so a histogram with KDE is always the right plot.
        fig = plt.figure(figsize=(10, 5))
        sns.histplot(x=col, data=df, kde=True)
        # Convert plot to image
        img = io.BytesIO()
        plt.savefig(img, format='png')
        plt.close(fig)  # free the figure; open figures accumulate otherwise
        img.seek(0)
        st.image(img)  # Display the image
# Function to preprocess data with PCA
def preprocess_data_with_pca(df):
    """One-hot encode, scale, and project the features with PCA.

    Pipeline: dummy-encode object columns, hold out the 'Response' target,
    expand 'Dt_Customer' into year/month numeric features, MinMax-scale the
    numeric columns, then keep enough principal components to explain 95%
    of the variance. The head of the projected matrix is shown in the app.

    Args:
        df: cleaned DataFrame with 'Response' target and datetime 'Dt_Customer'.

    Returns:
        Tuple of (PCA-projected feature matrix, 'Response' target Series).
    """
    st.subheader("Preprocessed Data with PCA")

    # One-hot encode categorical features
    categorical_cols = list(df.select_dtypes(include=['object']).columns)
    df_encoded = pd.get_dummies(df, columns=categorical_cols)

    # The target column is excluded from the clustering features.
    features = df_encoded.drop(columns=['Response'])

    # Replace the raw datetime with numeric year/month components.
    features['Dt_Customer_Year'] = features['Dt_Customer'].dt.year
    features['Dt_Customer_Month'] = features['Dt_Customer'].dt.month
    features = features.drop(columns=['Dt_Customer'])

    # MinMax scale numerical features into [0, 1].
    numeric_cols = list(features.select_dtypes(include=['number']).columns)
    features[numeric_cols] = MinMaxScaler().fit_transform(features[numeric_cols])

    # Project onto the components that retain 95% of the variance.
    projected = PCA(n_components=0.95).fit_transform(features)
    st.write(pd.DataFrame(projected).head())
    return projected, df['Response']
# Function to run K-Means clustering
def run_kmeans(X, y_true):
    """Cluster X with K-Means (5 clusters) and score the result.

    Args:
        X: feature matrix.
        y_true: ground-truth labels for the adjusted Rand index.

    Returns:
        Tuple (n_clusters, silhouette score, adjusted Rand index — or an
        'N/A' note when only one cluster was found).
    """
    model = KMeans(n_clusters=5, random_state=42)  # Example: 5 clusters
    labels = model.fit_predict(X)
    silhouette = silhouette_score(X, labels)
    # The Rand index is only meaningful with more than one distinct label.
    if len(set(labels)) > 1:
        rand_index = adjusted_rand_score(y_true, labels)
    else:
        rand_index = "N/A (Only one cluster found)"
    return model.n_clusters, silhouette, rand_index
# Function to run Hierarchical clustering
def run_hierarchical(X, y_true):
    """Cluster X with agglomerative (hierarchical) clustering and score it.

    Args:
        X: feature matrix.
        y_true: ground-truth labels for the adjusted Rand index.

    Returns:
        Tuple (n_clusters, silhouette score, adjusted Rand index — or an
        'N/A' note when only one cluster was found).
    """
    model = AgglomerativeClustering(n_clusters=5)  # Example: 5 clusters
    labels = model.fit_predict(X)
    silhouette = silhouette_score(X, labels)
    # The Rand index is only meaningful with more than one distinct label.
    if len(set(labels)) > 1:
        rand_index = adjusted_rand_score(y_true, labels)
    else:
        rand_index = "N/A (Only one cluster found)"
    return model.n_clusters, silhouette, rand_index
# Function to run DBSCAN clustering
def run_dbscan(X, y_true):
    """Cluster X with DBSCAN and score the result.

    Noise points (label -1) are not counted as a cluster. Both metrics are
    replaced with an 'N/A' note when fewer than two real clusters are found,
    since they are undefined in that case.

    Args:
        X: feature matrix.
        y_true: ground-truth labels for the adjusted Rand index.

    Returns:
        Tuple (n_clusters, silhouette score or note, Rand index or note).
    """
    labels = DBSCAN(eps=0.5, min_samples=5).fit_predict(X)  # Example parameters
    found_labels = set(labels)
    n_clusters = len(found_labels) - (1 if -1 in found_labels else 0)  # Adjust for noise
    if n_clusters > 1:
        return n_clusters, silhouette_score(X, labels), adjusted_rand_score(y_true, labels)
    note = "N/A (Only one cluster found)"
    return n_clusters, note, note
# Function to run Gaussian Mixture clustering
def run_gaussian_mixture(X, y_true):
    """Cluster X with a 5-component Gaussian mixture model and score it.

    Args:
        X: feature matrix.
        y_true: ground-truth labels for the adjusted Rand index.

    Returns:
        Tuple (n_components, silhouette score, adjusted Rand index — or an
        'N/A' note when only one cluster was found).
    """
    model = GaussianMixture(n_components=5, random_state=42)  # Example: 5 components
    labels = model.fit_predict(X)
    silhouette = silhouette_score(X, labels)
    # The Rand index is only meaningful with more than one distinct label.
    if len(set(labels)) > 1:
        rand_index = adjusted_rand_score(y_true, labels)
    else:
        rand_index = "N/A (Only one cluster found)"
    return model.n_components, silhouette, rand_index
# Main Streamlit app
def main():
    """Drive the app: load, clean, visualize, preprocess, then cluster in tabs."""
    st.title("Customer Segmentation App")

    # Load data
    df = load_data()

    # Data cleaning and validation. Note: the original called
    # handle_mixed_types twice (before and after check_data_types);
    # once, before null handling, is sufficient.
    df = handle_mixed_types(df)
    df = handle_nulls(df)
    df = check_data_types(df)

    # Visualize data
    visualize_data(df)

    # Preprocess data
    X_pca, y_true = preprocess_data_with_pca(df)

    # One tab per clustering algorithm. DBSCAN's silhouette may be an 'N/A'
    # string, so it is rendered without the :.3f format spec.
    tab1, tab2, tab3, tab4 = st.tabs(["K-Means", "Hierarchical", "DBSCAN", "Gaussian Mixture"])

    # Tab 1: K-Means
    with tab1:
        n_clusters, silhouette, rand_index = run_kmeans(X_pca, y_true)
        st.write(f"Number of Clusters: {n_clusters}")
        st.write(f"Silhouette Score: {silhouette:.3f}")
        st.write(f"Rand Index: {rand_index}")

    # Tab 2: Hierarchical
    with tab2:
        n_clusters, silhouette, rand_index = run_hierarchical(X_pca, y_true)
        st.write(f"Number of Clusters: {n_clusters}")
        st.write(f"Silhouette Score: {silhouette:.3f}")
        st.write(f"Rand Index: {rand_index}")

    # Tab 3: DBSCAN
    with tab3:
        n_clusters, silhouette, rand_index = run_dbscan(X_pca, y_true)
        st.write(f"Number of Clusters: {n_clusters}")
        st.write(f"Silhouette Score: {silhouette}")
        st.write(f"Rand Index: {rand_index}")

    # Tab 4: Gaussian Mixture
    with tab4:
        n_clusters, silhouette, rand_index = run_gaussian_mixture(X_pca, y_true)
        st.write(f"Number of Clusters: {n_clusters}")
        st.write(f"Silhouette Score: {silhouette:.3f}")
        st.write(f"Rand Index: {rand_index}")


if __name__ == "__main__":
    main()