# DhominickJ's picture
# Initial Commit for the Mall Customers Prediction
# 28a5f7d
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import plotly.express as px
# Function to load and preprocess the data
def load_and_preprocess_data(file_uploaded):
    """Load a Play Store CSV, clean it, and build the clustering matrix.

    Parameters
    ----------
    file_uploaded : str or file-like
        Path or uploaded file object accepted by ``pd.read_csv``.

    Returns
    -------
    tuple
        ``(df, scaled_data, scaler)`` — ``df`` holds the selected raw feature
        columns, ``scaled_data`` is a numpy array of standardized numeric
        features followed by label-encoded categoricals, and ``scaler`` is
        the fitted ``StandardScaler``. Returns ``(None, None, None)`` on
        failure so callers can unpack the result without a TypeError.
    """
    try:
        df = pd.read_csv(file_uploaded).dropna()

        # 'Varies with device' has no numeric value; substitute the most
        # common concrete size (the mode) so the column can be parsed.
        size_mode = df.loc[df['Size'] != 'Varies with device', 'Size'].mode()[0]
        df['Size'] = df['Size'].replace('Varies with device', size_mode)
        # Convert 'Size' to numeric: strip the 'M' suffix, or strip 'k' and
        # divide by 1000 to express kilobyte sizes in megabytes.
        df['Size'] = df['Size'].apply(
            lambda x: float(str(x).replace('M', '')) if 'M' in str(x)
            else float(str(x).replace('k', '')) / 1000
        )
        # Strip formatting characters so 'Installs' and 'Price' parse as numbers.
        df['Installs'] = df['Installs'].apply(
            lambda x: int(str(x).replace('+', '').replace(',', '')))
        df['Price'] = df['Price'].apply(lambda x: float(str(x).replace('$', '')))

        # Select relevant features for clustering.
        features = ['Category', 'Rating', 'Reviews', 'Size', 'Installs',
                    'Type', 'Price', 'Content Rating', 'Genres']
        df = df[features].copy()

        numerical_features = ['Rating', 'Reviews', 'Size', 'Installs', 'Price']

        # Scale only the numeric features; label-encode each categorical
        # column with its own encoder and append the codes unscaled.
        scaler = StandardScaler()
        df_scaled = pd.DataFrame(
            scaler.fit_transform(df[numerical_features]),
            columns=numerical_features,
        )
        for base_col in categorical_columns:
            df_scaled[base_col + '_encoded'] = LabelEncoder().fit_transform(df[base_col])

        return df, df_scaled.values, scaler
    except Exception as e:
        st.error(f"Error loading and preprocessing data: {e}")
        # Return a 3-tuple so `df, scaled, scaler = ...` unpacking still works.
        return None, None, None
# Function to implement KMeans
def kmeans_clustering(scaled_data, n_clusters):
    """Fit KMeans with a fixed random seed; return (labels, fitted model)."""
    model = KMeans(n_clusters=n_clusters, random_state=42).fit(scaled_data)
    return model.labels_, model
# Function to implement DBSCAN
def dbscan_clustering(scaled_data, eps, min_samples):
    """Fit density-based clustering; noise points receive the label -1."""
    model = DBSCAN(eps=eps, min_samples=min_samples).fit(scaled_data)
    return model.labels_, model
# Function to implement Agglomerative Clustering
def agglomerative_clustering(scaled_data, n_clusters):
    """Fit hierarchical agglomerative clustering (sklearn default linkage)."""
    model = AgglomerativeClustering(n_clusters=n_clusters).fit(scaled_data)
    return model.labels_, model
# Function to implement Gaussian Mixture Model
def gaussian_mixture_clustering(scaled_data, n_components):
    """Fit a GMM with a fixed seed; return hard-assignment labels and the model."""
    model = GaussianMixture(n_components=n_components, random_state=42).fit(scaled_data)
    return model.predict(scaled_data), model
# Function to plot scatter plot
def plot_scatter(df, labels, title, scaled_data):
    """Project ``scaled_data`` to 2-D with PCA and draw a Plotly scatter.

    Note: ``df`` is accepted for interface compatibility but is not used.
    """
    components = PCA(n_components=2).fit_transform(scaled_data)
    plot_df = pd.DataFrame(components, columns=['PC1', 'PC2'])
    plot_df['Cluster'] = labels
    fig = px.scatter(plot_df, x='PC1', y='PC2', color='Cluster', title=title)
    st.plotly_chart(fig)
# Function to plot elbow curve
def plot_elbow_curve(scaled_data, max_clusters):
    """Render the elbow curve (KMeans inertia) for k = 1..max_clusters."""
    ks = range(1, max_clusters + 1)
    wcss = [KMeans(n_clusters=k, random_state=42).fit(scaled_data).inertia_
            for k in ks]
    fig, ax = plt.subplots()
    ax.plot(ks, wcss, marker='o')
    ax.set_title('Elbow Curve')
    ax.set_xlabel('Number of Clusters')
    ax.set_ylabel('WCSS')
    st.pyplot(fig)
# Function to display performance metrics
def display_performance_metrics(labels, scaled_data):
    """Write the silhouette score; it needs at least two distinct labels."""
    if len(set(labels)) < 2:
        st.write("Silhouette Score cannot be computed for a single cluster.")
        return
    silhouette = silhouette_score(scaled_data, labels)
    st.write(f"Silhouette Score: {silhouette:.2f}")
# Define categorical columns globally
# Shared by load_and_preprocess_data() and the sidebar input form in main().
categorical_columns = ['Category', 'Content Rating', 'Genres', 'Type']
# Main function
def main():
    """Streamlit entry point.

    Loads the dataset (uploaded or bundled), lets the user tune parameters,
    runs four clustering algorithms in tabs, shows a feature-correlation
    heatmap, predicts the cluster of a user-entered data point, and offers
    the cluster assignments as a CSV download.
    """
    st.title("Unsupervised Learning for App Recommendation")
    # File upload (fall back to the bundled sample when nothing is uploaded)
    file = st.sidebar.file_uploader("Upload CSV file", type=["csv"])
    if file is None:
        file = './googleplaystoreapps.csv'
    if file is not None:
        # Sidebar for parameter tuning
        st.sidebar.header("Upload Custom Data Here")
        loaded = load_and_preprocess_data(file)
        # The loader reports its own error via st.error; stop cleanly
        # instead of crashing on the unpack when preprocessing failed.
        if loaded is None or loaded[0] is None:
            st.stop()
        df, scaled_data, scaler = loaded
        st.sidebar.header("Parameter Tuning")
        n_clusters = st.sidebar.slider("Number of Clusters", 2, 10, 3)
        eps = st.sidebar.slider("Epsilon (DBSCAN)", 0.1, 1.0, 0.5, 0.1)
        min_samples = st.sidebar.slider("Minimum Samples (DBSCAN)", 1, 10, 5)
        n_components = st.sidebar.slider("Number of Components (GMM)", 2, 10, 3)
        # Tabs for different algorithms
        tab1, tab2, tab3, tab4, tab5 = st.tabs(["KMeans", "DBSCAN", "Agglomerative Clustering", "Gaussian Mixture Model", "Feature Correlation"])
        with tab1:
            st.header("KMeans Clustering")
            kmeans_labels, kmeans = kmeans_clustering(scaled_data, n_clusters)
            plot_scatter(df, kmeans_labels, "KMeans Clustering", scaled_data)
            display_performance_metrics(kmeans_labels, scaled_data)
            plot_elbow_curve(scaled_data, 10)
        with tab2:
            st.header("DBSCAN Clustering")
            dbscan_labels, dbscan = dbscan_clustering(scaled_data, eps, min_samples)
            plot_scatter(df, dbscan_labels, "DBSCAN Clustering", scaled_data)
            display_performance_metrics(dbscan_labels, scaled_data)
        with tab3:
            st.header("Agglomerative Clustering")
            agg_labels, agglomerative = agglomerative_clustering(scaled_data, n_clusters)
            plot_scatter(df, agg_labels, "Agglomerative Clustering", scaled_data)
            display_performance_metrics(agg_labels, scaled_data)
        with tab4:
            st.header("Gaussian Mixture Model")
            gmm_labels, gmm = gaussian_mixture_clustering(scaled_data, n_components)
            plot_scatter(df, gmm_labels, "Gaussian Mixture Model", scaled_data)
            display_performance_metrics(gmm_labels, scaled_data)
        with tab5:
            st.header("Feature Correlation Analysis")
            numerical_df = df.select_dtypes(include=[np.number])
            corr_matrix = numerical_df.corr()
            fig, ax = plt.subplots(figsize=(10, 8))
            sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
            st.pyplot(fig)
        # User input for prediction
        st.sidebar.header("Input New Data Point")
        new_data = {}
        # Fit one LabelEncoder per categorical column so the sidebar can show
        # the original category names while storing their encoded values.
        original_values = {}
        le_dict = {}
        for col in categorical_columns:
            original_values[col] = df[col].unique()
            le_dict[col] = LabelEncoder().fit(original_values[col])
        for col in df.columns:
            if col in categorical_columns:
                # Use original values for display but store the encoded value
                selected_value = st.sidebar.selectbox(f"Select {col}", original_values[col])
                new_data[col] = le_dict[col].transform([selected_value])[0]
            else:
                default_value = df[col].mean()
                if col == 'Rating':
                    # Ratings live on a 1-5 scale; clamp the default only for
                    # this column — clamping every numeric column would cap
                    # defaults like Reviews and Installs at 5.
                    default_value = np.clip(default_value, 1.0, 5.0)
                new_data[col] = st.sidebar.number_input(f"Enter {col}", value=float(default_value))
        new_data_df = pd.DataFrame([new_data])
        # Scale the numeric part with the training scaler, then append the
        # (unscaled) encoded categoricals — the same column layout that
        # load_and_preprocess_data produced for the training matrix.
        numerical_features = ['Rating', 'Reviews', 'Size', 'Installs', 'Price']
        new_data_scaled = np.hstack([
            scaler.transform(new_data_df[numerical_features]),
            new_data_df[[c for c in new_data_df.columns if c in categorical_columns]].values,
        ])
        # Predict cluster for new data point
        st.sidebar.header("Cluster Prediction")
        if st.sidebar.button("Predict"):
            kmeans_label = kmeans.predict(new_data_scaled)
            # NOTE(review): DBSCAN has no predict(); refitting on a single
            # point always labels it noise (-1). Kept for display parity.
            dbscan_label = dbscan.fit_predict(new_data_scaled)
            # AgglomerativeClustering cannot assign new points either.
            agglomerative_label = [-1]
            gmm_label = gmm.predict(new_data_scaled)
            st.sidebar.write(f"KMeans Cluster: {kmeans_label[0]}")
            st.sidebar.write(f"DBSCAN Cluster: {dbscan_label[0]}")
            st.sidebar.write(f"Agglomerative Cluster: {agglomerative_label[0]}")
            st.sidebar.write(f"GMM Cluster: {gmm_label[0]}")
        # Download results
        st.sidebar.header("Download Results")
        if st.sidebar.button("Download Results"):
            results = pd.DataFrame({
                # Use each model's own labels; a single shared `labels`
                # variable would hold only the last tab's (GMM) labels.
                'Cluster (KMeans)': kmeans.labels_,
                'Cluster (DBSCAN)': dbscan.labels_,
                'Cluster (Agglomerative)': agglomerative.labels_,
                'Cluster (GMM)': gmm_labels,
            })
            st.sidebar.download_button("Download CSV", results.to_csv(index=False), "results.csv")


# Standard script entry guard so importing this module has no side effects.
if __name__ == "__main__":
    main()