# DhominickJ's picture
# Initial Commit for the Mall Customers Prediction
# 28a5f7d
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import plotly.express as px
# Function to load and preprocess the data
def load_and_preprocess_data(file_uploaded):
    """Load a Play Store CSV, clean it, and build the clustering matrix.

    Parameters
    ----------
    file_uploaded : str or file-like
        Path or uploaded file object accepted by ``pd.read_csv``.

    Returns
    -------
    tuple
        ``(df, scaled_data, scaler)`` — ``df`` holds the selected raw feature
        columns, ``scaled_data`` is a numpy array of standardized numeric
        features followed by label-encoded categoricals, and ``scaler`` is
        the fitted ``StandardScaler``. Returns ``(None, None, None)`` on
        failure so callers can unpack the result without a TypeError.
    """
    try:
        df = pd.read_csv(file_uploaded).dropna()

        # 'Varies with device' has no numeric value; substitute the most
        # common concrete size (the mode) so the column can be parsed.
        size_mode = df.loc[df['Size'] != 'Varies with device', 'Size'].mode()[0]
        df['Size'] = df['Size'].replace('Varies with device', size_mode)
        # Convert 'Size' to numeric: strip the 'M' suffix, or strip 'k' and
        # divide by 1000 to express kilobyte sizes in megabytes.
        df['Size'] = df['Size'].apply(
            lambda x: float(str(x).replace('M', '')) if 'M' in str(x)
            else float(str(x).replace('k', '')) / 1000
        )
        # Strip formatting characters so 'Installs' and 'Price' parse as numbers.
        df['Installs'] = df['Installs'].apply(
            lambda x: int(str(x).replace('+', '').replace(',', '')))
        df['Price'] = df['Price'].apply(lambda x: float(str(x).replace('$', '')))

        # Select relevant features for clustering.
        features = ['Category', 'Rating', 'Reviews', 'Size', 'Installs',
                    'Type', 'Price', 'Content Rating', 'Genres']
        df = df[features].copy()

        numerical_features = ['Rating', 'Reviews', 'Size', 'Installs', 'Price']

        # Scale only the numeric features; label-encode each categorical
        # column with its own encoder and append the codes unscaled.
        scaler = StandardScaler()
        df_scaled = pd.DataFrame(
            scaler.fit_transform(df[numerical_features]),
            columns=numerical_features,
        )
        for base_col in categorical_columns:
            df_scaled[base_col + '_encoded'] = LabelEncoder().fit_transform(df[base_col])

        return df, df_scaled.values, scaler
    except Exception as e:
        st.error(f"Error loading and preprocessing data: {e}")
        # Return a 3-tuple so `df, scaled, scaler = ...` unpacking still works.
        return None, None, None
# Function to implement KMeans
def kmeans_clustering(scaled_data, n_clusters):
    """Fit KMeans with a fixed random seed; return (labels, fitted model)."""
    model = KMeans(n_clusters=n_clusters, random_state=42).fit(scaled_data)
    return model.labels_, model
# Function to implement DBSCAN
def dbscan_clustering(scaled_data, eps, min_samples):
    """Fit density-based clustering; noise points receive the label -1."""
    model = DBSCAN(eps=eps, min_samples=min_samples).fit(scaled_data)
    return model.labels_, model
# Function to implement Agglomerative Clustering
def agglomerative_clustering(scaled_data, n_clusters):
    """Fit hierarchical agglomerative clustering (sklearn default linkage)."""
    model = AgglomerativeClustering(n_clusters=n_clusters).fit(scaled_data)
    return model.labels_, model
# Function to implement Gaussian Mixture Model
def gaussian_mixture_clustering(scaled_data, n_components):
    """Fit a GMM with a fixed seed; return hard-assignment labels and the model."""
    model = GaussianMixture(n_components=n_components, random_state=42).fit(scaled_data)
    return model.predict(scaled_data), model
# Function to plot scatter plot
def plot_scatter(df, labels, title, scaled_data):
    """Project ``scaled_data`` to 2-D with PCA and draw a Plotly scatter.

    Note: ``df`` is accepted for interface compatibility but is not used.
    """
    components = PCA(n_components=2).fit_transform(scaled_data)
    plot_df = pd.DataFrame(components, columns=['PC1', 'PC2'])
    plot_df['Cluster'] = labels
    fig = px.scatter(plot_df, x='PC1', y='PC2', color='Cluster', title=title)
    st.plotly_chart(fig)
# Function to plot elbow curve
def plot_elbow_curve(scaled_data, max_clusters):
    """Render the elbow curve (KMeans inertia) for k = 1..max_clusters."""
    ks = range(1, max_clusters + 1)
    wcss = [KMeans(n_clusters=k, random_state=42).fit(scaled_data).inertia_
            for k in ks]
    fig, ax = plt.subplots()
    ax.plot(ks, wcss, marker='o')
    ax.set_title('Elbow Curve')
    ax.set_xlabel('Number of Clusters')
    ax.set_ylabel('WCSS')
    st.pyplot(fig)
# Function to display performance metrics
def display_performance_metrics(labels, scaled_data):
    """Write the silhouette score; it needs at least two distinct labels."""
    if len(set(labels)) < 2:
        st.write("Silhouette Score cannot be computed for a single cluster.")
        return
    silhouette = silhouette_score(scaled_data, labels)
    st.write(f"Silhouette Score: {silhouette:.2f}")
# Define categorical columns globally
# Shared by load_and_preprocess_data() and the sidebar input form in main().
categorical_columns = ['Category', 'Content Rating', 'Genres', 'Type']
# Main function
def main():
    """Streamlit entry point.

    Loads the dataset (uploaded or bundled), lets the user tune parameters,
    runs four clustering algorithms in tabs, shows a feature-correlation
    heatmap, predicts the cluster of a user-entered data point, and offers
    the cluster assignments as a CSV download.
    """
    st.title("Unsupervised Learning for App Recommendation")
    # File upload (fall back to the bundled sample when nothing is uploaded)
    file = st.sidebar.file_uploader("Upload CSV file", type=["csv"])
    if file is None:
        file = './googleplaystoreapps.csv'
    if file is not None:
        # Sidebar for parameter tuning
        st.sidebar.header("Upload Custom Data Here")
        loaded = load_and_preprocess_data(file)
        # The loader reports its own error via st.error; stop cleanly
        # instead of crashing on the unpack when preprocessing failed.
        if loaded is None or loaded[0] is None:
            st.stop()
        df, scaled_data, scaler = loaded
        st.sidebar.header("Parameter Tuning")
        n_clusters = st.sidebar.slider("Number of Clusters", 2, 10, 3)
        eps = st.sidebar.slider("Epsilon (DBSCAN)", 0.1, 1.0, 0.5, 0.1)
        min_samples = st.sidebar.slider("Minimum Samples (DBSCAN)", 1, 10, 5)
        n_components = st.sidebar.slider("Number of Components (GMM)", 2, 10, 3)
        # Tabs for different algorithms
        tab1, tab2, tab3, tab4, tab5 = st.tabs(["KMeans", "DBSCAN", "Agglomerative Clustering", "Gaussian Mixture Model", "Feature Correlation"])
        with tab1:
            st.header("KMeans Clustering")
            kmeans_labels, kmeans = kmeans_clustering(scaled_data, n_clusters)
            plot_scatter(df, kmeans_labels, "KMeans Clustering", scaled_data)
            display_performance_metrics(kmeans_labels, scaled_data)
            plot_elbow_curve(scaled_data, 10)
        with tab2:
            st.header("DBSCAN Clustering")
            dbscan_labels, dbscan = dbscan_clustering(scaled_data, eps, min_samples)
            plot_scatter(df, dbscan_labels, "DBSCAN Clustering", scaled_data)
            display_performance_metrics(dbscan_labels, scaled_data)
        with tab3:
            st.header("Agglomerative Clustering")
            agg_labels, agglomerative = agglomerative_clustering(scaled_data, n_clusters)
            plot_scatter(df, agg_labels, "Agglomerative Clustering", scaled_data)
            display_performance_metrics(agg_labels, scaled_data)
        with tab4:
            st.header("Gaussian Mixture Model")
            gmm_labels, gmm = gaussian_mixture_clustering(scaled_data, n_components)
            plot_scatter(df, gmm_labels, "Gaussian Mixture Model", scaled_data)
            display_performance_metrics(gmm_labels, scaled_data)
        with tab5:
            st.header("Feature Correlation Analysis")
            numerical_df = df.select_dtypes(include=[np.number])
            corr_matrix = numerical_df.corr()
            fig, ax = plt.subplots(figsize=(10, 8))
            sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
            st.pyplot(fig)
        # User input for prediction
        st.sidebar.header("Input New Data Point")
        new_data = {}
        # Fit one LabelEncoder per categorical column so the sidebar can show
        # the original category names while storing their encoded values.
        original_values = {}
        le_dict = {}
        for col in categorical_columns:
            original_values[col] = df[col].unique()
            le_dict[col] = LabelEncoder().fit(original_values[col])
        for col in df.columns:
            if col in categorical_columns:
                # Use original values for display but store the encoded value
                selected_value = st.sidebar.selectbox(f"Select {col}", original_values[col])
                new_data[col] = le_dict[col].transform([selected_value])[0]
            else:
                default_value = df[col].mean()
                if col == 'Rating':
                    # Ratings live on a 1-5 scale; clamp the default only for
                    # this column — clamping every numeric column would cap
                    # defaults like Reviews and Installs at 5.
                    default_value = np.clip(default_value, 1.0, 5.0)
                new_data[col] = st.sidebar.number_input(f"Enter {col}", value=float(default_value))
        new_data_df = pd.DataFrame([new_data])
        # Scale the numeric part with the training scaler, then append the
        # (unscaled) encoded categoricals — the same column layout that
        # load_and_preprocess_data produced for the training matrix.
        numerical_features = ['Rating', 'Reviews', 'Size', 'Installs', 'Price']
        new_data_scaled = np.hstack([
            scaler.transform(new_data_df[numerical_features]),
            new_data_df[[c for c in new_data_df.columns if c in categorical_columns]].values,
        ])
        # Predict cluster for new data point
        st.sidebar.header("Cluster Prediction")
        if st.sidebar.button("Predict"):
            kmeans_label = kmeans.predict(new_data_scaled)
            # NOTE(review): DBSCAN has no predict(); refitting on a single
            # point always labels it noise (-1). Kept for display parity.
            dbscan_label = dbscan.fit_predict(new_data_scaled)
            # AgglomerativeClustering cannot assign new points either.
            agglomerative_label = [-1]
            gmm_label = gmm.predict(new_data_scaled)
            st.sidebar.write(f"KMeans Cluster: {kmeans_label[0]}")
            st.sidebar.write(f"DBSCAN Cluster: {dbscan_label[0]}")
            st.sidebar.write(f"Agglomerative Cluster: {agglomerative_label[0]}")
            st.sidebar.write(f"GMM Cluster: {gmm_label[0]}")
        # Download results
        st.sidebar.header("Download Results")
        if st.sidebar.button("Download Results"):
            results = pd.DataFrame({
                # Use each model's own labels; a single shared `labels`
                # variable would hold only the last tab's (GMM) labels.
                'Cluster (KMeans)': kmeans.labels_,
                'Cluster (DBSCAN)': dbscan.labels_,
                'Cluster (Agglomerative)': agglomerative.labels_,
                'Cluster (GMM)': gmm_labels,
            })
            st.sidebar.download_button("Download CSV", results.to_csv(index=False), "results.csv")


# Standard script entry guard so importing this module has no side effects.
if __name__ == "__main__":
    main()