# Customer Segmentation Streamlit app (Hugging Face Space, commit 5c5a3b6)
import streamlit as st
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, adjusted_rand_score
import matplotlib.pyplot as plt
import seaborn as sns
import io
# Function to load the dataset with st.spinner
@st.cache_data  # cache so Streamlit reruns skip the disk read
def load_data():
    """Read the tab-separated marketing campaign dataset from disk.

    Returns:
        DataFrame loaded from 'marketing_campaign.csv' (TSV format).
    """
    with st.spinner("Loading data..."):
        frame = pd.read_csv("marketing_campaign.csv", delimiter='\t')
    return frame
def handle_mixed_types(df):
    """Coerce columns whose cells hold more than one Python type to one type.

    Columns mixing only numeric types (int/float) become float; any other mix
    (e.g. numbers and strings) becomes string.

    Bug fixed: types are now detected on non-null cells only, and NaN cells
    are kept as NaN when stringifying. The original `astype(str)` turned NaN
    into the literal string 'nan', which then escaped handle_nulls(), and a
    single float NaN inside a string column forced a spurious conversion.

    Args:
        df: DataFrame to normalize (modified in place).

    Returns:
        The same DataFrame with per-column homogeneous value types.
    """
    for col in df.columns:
        # Ignore missing values: NaN's float type must not count as a "mix".
        unique_types = df[col].dropna().apply(type).unique()
        if len(unique_types) > 1:  # Check if there are mixed types
            if all(issubclass(t, (int, float)) for t in unique_types):
                # Mixed int/float -> float keeps the column numeric.
                df[col] = df[col].astype(float)
            else:
                # Mixed numeric/string -> string, but preserve NaN as NaN so
                # later null imputation still sees it.
                df[col] = df[col].where(df[col].isna(), df[col].astype(str))
    return df
def handle_nulls(df):
    """Impute missing values column by column.

    Categorical (object) columns are filled with the column mode; numerical
    columns are filled with the column mean.

    Bug fixed: ``fillna(df[col].mode())`` passed a *Series*, which fillna
    aligns by index — effectively only filling the row at index 0. We now
    take the scalar mode, and skip all-null columns (which have no mode).

    Args:
        df: DataFrame to impute (modified in place).

    Returns:
        The same DataFrame with nulls imputed.
    """
    for col in df.columns:
        if df[col].dtype == 'object':
            mode = df[col].mode()
            if not mode.empty:  # an all-null column has no mode; leave as-is
                df[col] = df[col].fillna(mode.iloc[0])
        else:
            df[col] = df[col].fillna(df[col].mean())
    return df
# Function to check data type consistency
def check_data_types(df):
    """Normalize column data types for downstream feature engineering.

    Parses the 'Dt_Customer' column (day-first date strings such as
    '04-09-2012') into pandas datetimes so later steps can use .dt accessors.

    Args:
        df: DataFrame containing a 'Dt_Customer' column.

    Returns:
        The same DataFrame with 'Dt_Customer' as datetime64.
    """
    parsed_dates = pd.to_datetime(df['Dt_Customer'], dayfirst=True)
    df['Dt_Customer'] = parsed_dates
    return df
# Function to visualize data distribution
def visualize_data(df):
    """Show distribution plots for the three highest-variance numeric columns.

    Each plot is rendered to an in-memory PNG and displayed with st.image.

    Fixes over the original: the object-dtype countplot branch was
    unreachable (columns are selected from a numeric-only frame), and
    figures were never closed, leaking matplotlib state across Streamlit
    reruns.

    Args:
        df: cleaned DataFrame (object and datetime columns are skipped).
    """
    st.subheader("Data Visualization")
    # Select top 3 columns with highest variance (excluding date and object types)
    numerical_df = df.select_dtypes(exclude=['object', 'datetime'])
    top_3_cols = numerical_df.var().sort_values(ascending=False).head(3).index.tolist()
    for col in top_3_cols:
        # Columns here are guaranteed numeric (selected from numerical_df),
        # so a histogram with KDE is always the right plot.
        fig = plt.figure(figsize=(10, 5))
        sns.histplot(x=col, data=df, kde=True)
        # Convert plot to image
        img = io.BytesIO()
        plt.savefig(img, format='png')
        plt.close(fig)  # free the figure; open figures accumulate otherwise
        img.seek(0)
        st.image(img)  # Display the image
# Function to preprocess data with PCA
def preprocess_data_with_pca(df):
    """One-hot encode, scale, and project the features with PCA.

    Pipeline: dummy-encode object columns, hold out the 'Response' target,
    expand 'Dt_Customer' into year/month numeric features, MinMax-scale the
    numeric columns, then keep enough principal components to explain 95%
    of the variance. The head of the projected matrix is shown in the app.

    Args:
        df: cleaned DataFrame with 'Response' target and datetime 'Dt_Customer'.

    Returns:
        Tuple of (PCA-projected feature matrix, 'Response' target Series).
    """
    st.subheader("Preprocessed Data with PCA")

    # One-hot encode categorical features
    categorical_cols = list(df.select_dtypes(include=['object']).columns)
    df_encoded = pd.get_dummies(df, columns=categorical_cols)

    # The target column is excluded from the clustering features.
    features = df_encoded.drop(columns=['Response'])

    # Replace the raw datetime with numeric year/month components.
    features['Dt_Customer_Year'] = features['Dt_Customer'].dt.year
    features['Dt_Customer_Month'] = features['Dt_Customer'].dt.month
    features = features.drop(columns=['Dt_Customer'])

    # MinMax scale numerical features into [0, 1].
    numeric_cols = list(features.select_dtypes(include=['number']).columns)
    features[numeric_cols] = MinMaxScaler().fit_transform(features[numeric_cols])

    # Project onto the components that retain 95% of the variance.
    projected = PCA(n_components=0.95).fit_transform(features)
    st.write(pd.DataFrame(projected).head())
    return projected, df['Response']
# Function to run K-Means clustering
def run_kmeans(X, y_true):
    """Cluster X with K-Means (5 clusters) and score the result.

    Args:
        X: feature matrix.
        y_true: ground-truth labels for the adjusted Rand index.

    Returns:
        Tuple (n_clusters, silhouette score, adjusted Rand index — or an
        'N/A' note when only one cluster was found).
    """
    model = KMeans(n_clusters=5, random_state=42)  # Example: 5 clusters
    labels = model.fit_predict(X)
    silhouette = silhouette_score(X, labels)
    # The Rand index is only meaningful with more than one distinct label.
    if len(set(labels)) > 1:
        rand_index = adjusted_rand_score(y_true, labels)
    else:
        rand_index = "N/A (Only one cluster found)"
    return model.n_clusters, silhouette, rand_index
# Function to run Hierarchical clustering
def run_hierarchical(X, y_true):
    """Cluster X with agglomerative (hierarchical) clustering and score it.

    Args:
        X: feature matrix.
        y_true: ground-truth labels for the adjusted Rand index.

    Returns:
        Tuple (n_clusters, silhouette score, adjusted Rand index — or an
        'N/A' note when only one cluster was found).
    """
    model = AgglomerativeClustering(n_clusters=5)  # Example: 5 clusters
    labels = model.fit_predict(X)
    silhouette = silhouette_score(X, labels)
    # The Rand index is only meaningful with more than one distinct label.
    if len(set(labels)) > 1:
        rand_index = adjusted_rand_score(y_true, labels)
    else:
        rand_index = "N/A (Only one cluster found)"
    return model.n_clusters, silhouette, rand_index
# Function to run DBSCAN clustering
def run_dbscan(X, y_true):
    """Cluster X with DBSCAN and score the result.

    Noise points (label -1) are not counted as a cluster. Both metrics are
    replaced with an 'N/A' note when fewer than two real clusters are found,
    since they are undefined in that case.

    Args:
        X: feature matrix.
        y_true: ground-truth labels for the adjusted Rand index.

    Returns:
        Tuple (n_clusters, silhouette score or note, Rand index or note).
    """
    labels = DBSCAN(eps=0.5, min_samples=5).fit_predict(X)  # Example parameters
    found_labels = set(labels)
    n_clusters = len(found_labels) - (1 if -1 in found_labels else 0)  # Adjust for noise
    if n_clusters > 1:
        return n_clusters, silhouette_score(X, labels), adjusted_rand_score(y_true, labels)
    note = "N/A (Only one cluster found)"
    return n_clusters, note, note
# Function to run Gaussian Mixture clustering
def run_gaussian_mixture(X, y_true):
    """Cluster X with a 5-component Gaussian mixture model and score it.

    Args:
        X: feature matrix.
        y_true: ground-truth labels for the adjusted Rand index.

    Returns:
        Tuple (n_components, silhouette score, adjusted Rand index — or an
        'N/A' note when only one cluster was found).
    """
    model = GaussianMixture(n_components=5, random_state=42)  # Example: 5 components
    labels = model.fit_predict(X)
    silhouette = silhouette_score(X, labels)
    # The Rand index is only meaningful with more than one distinct label.
    if len(set(labels)) > 1:
        rand_index = adjusted_rand_score(y_true, labels)
    else:
        rand_index = "N/A (Only one cluster found)"
    return model.n_components, silhouette, rand_index
# Main Streamlit app
def main():
    """Drive the app: load, clean, visualize, preprocess, then cluster in tabs."""
    st.title("Customer Segmentation App")

    # Load data
    df = load_data()

    # Data cleaning and validation. Note: the original called
    # handle_mixed_types twice (before and after check_data_types);
    # once, before null handling, is sufficient.
    df = handle_mixed_types(df)
    df = handle_nulls(df)
    df = check_data_types(df)

    # Visualize data
    visualize_data(df)

    # Preprocess data
    X_pca, y_true = preprocess_data_with_pca(df)

    # One tab per clustering algorithm. DBSCAN's silhouette may be an 'N/A'
    # string, so it is rendered without the :.3f format spec.
    tab1, tab2, tab3, tab4 = st.tabs(["K-Means", "Hierarchical", "DBSCAN", "Gaussian Mixture"])

    # Tab 1: K-Means
    with tab1:
        n_clusters, silhouette, rand_index = run_kmeans(X_pca, y_true)
        st.write(f"Number of Clusters: {n_clusters}")
        st.write(f"Silhouette Score: {silhouette:.3f}")
        st.write(f"Rand Index: {rand_index}")

    # Tab 2: Hierarchical
    with tab2:
        n_clusters, silhouette, rand_index = run_hierarchical(X_pca, y_true)
        st.write(f"Number of Clusters: {n_clusters}")
        st.write(f"Silhouette Score: {silhouette:.3f}")
        st.write(f"Rand Index: {rand_index}")

    # Tab 3: DBSCAN
    with tab3:
        n_clusters, silhouette, rand_index = run_dbscan(X_pca, y_true)
        st.write(f"Number of Clusters: {n_clusters}")
        st.write(f"Silhouette Score: {silhouette}")
        st.write(f"Rand Index: {rand_index}")

    # Tab 4: Gaussian Mixture
    with tab4:
        n_clusters, silhouette, rand_index = run_gaussian_mixture(X_pca, y_true)
        st.write(f"Number of Clusters: {n_clusters}")
        st.write(f"Silhouette Score: {silhouette:.3f}")
        st.write(f"Rand Index: {rand_index}")


if __name__ == "__main__":
    main()