"""Streamlit app: unsupervised clustering of Google Play Store apps.

Loads a Play Store CSV, cleans and encodes it, then compares four clustering
algorithms (KMeans, DBSCAN, Agglomerative, Gaussian Mixture) with PCA scatter
plots, an elbow curve, silhouette scores, a feature-correlation heatmap, and a
simple "predict the cluster of a new app" sidebar.
"""

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import plotly.express as px

# Columns label-encoded for clustering; this order also defines the column
# order of the encoded part of `scaled_data` (see load_and_preprocess_data).
categorical_columns = ['Category', 'Content Rating', 'Genres', 'Type']
# Columns standard-scaled for clustering; they come first in `scaled_data`.
numerical_features = ['Rating', 'Reviews', 'Size', 'Installs', 'Price']


def load_and_preprocess_data(file_uploaded):
    """Load the Play Store CSV and return ``(df, scaled_data, scaler)``.

    Parameters
    ----------
    file_uploaded : file-like or str
        An uploaded CSV buffer or a filesystem path accepted by
        ``pd.read_csv``.

    Returns
    -------
    df : pd.DataFrame
        Cleaned frame restricted to the clustering features (original,
        un-encoded values).
    scaled_data : np.ndarray
        Standard-scaled numerical features followed by label-encoded
        categorical features, in ``numerical_features`` +
        ``categorical_columns`` column order.
    scaler : StandardScaler
        The scaler fit on the numerical features (reused for new points).

    On failure a Streamlit error is shown and ``(None, None, None)`` is
    returned so callers can guard instead of crashing on tuple unpacking.
    """
    try:
        df = pd.read_csv(file_uploaded)
        df = df.dropna()

        # Replace 'Varies with device' with the most common concrete size.
        concrete_sizes = df.loc[df['Size'] != 'Varies with device', 'Size']
        df['Size'] = df['Size'].replace('Varies with device',
                                        concrete_sizes.mode()[0])
        # '19M' -> 19.0 (MB); '512k' -> 0.512 (MB).
        df['Size'] = df['Size'].apply(
            lambda x: float(str(x).replace('M', '')) if 'M' in str(x)
            else float(str(x).replace('k', '')) / 1000)

        # '10,000+' -> 10000
        df['Installs'] = df['Installs'].apply(
            lambda x: int(str(x).replace('+', '').replace(',', '')))
        # '$4.99' -> 4.99
        df['Price'] = df['Price'].apply(
            lambda x: float(str(x).replace('$', '')))

        # Keep only the features used for clustering (original column order
        # preserved: it drives the sidebar input order in main()).
        features = ['Category', 'Rating', 'Reviews', 'Size', 'Installs',
                    'Type', 'Price', 'Content Rating', 'Genres']
        df = df[features].copy()

        # Scale numerical features only; categoricals are label-encoded
        # (LabelEncoder sorts its classes, so re-fitting on the same values
        # elsewhere yields the same codes).
        scaler = StandardScaler()
        df_scaled = pd.DataFrame(
            scaler.fit_transform(df[numerical_features]),
            columns=numerical_features)
        for col in categorical_columns:
            df_scaled[col + '_encoded'] = LabelEncoder().fit_transform(df[col])

        return df, df_scaled.values, scaler
    except Exception as e:
        st.error(f"Error loading and preprocessing data: {e}")
        return None, None, None


def kmeans_clustering(scaled_data, n_clusters):
    """Fit KMeans on ``scaled_data``; return ``(labels, fitted_model)``."""
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(scaled_data)
    return kmeans.labels_, kmeans


def dbscan_clustering(scaled_data, eps, min_samples):
    """Fit DBSCAN on ``scaled_data``; return ``(labels, fitted_model)``.

    Labels may contain -1 for noise points.
    """
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    dbscan.fit(scaled_data)
    return dbscan.labels_, dbscan


def agglomerative_clustering(scaled_data, n_clusters):
    """Fit AgglomerativeClustering; return ``(labels, fitted_model)``."""
    agglomerative = AgglomerativeClustering(n_clusters=n_clusters)
    agglomerative.fit(scaled_data)
    return agglomerative.labels_, agglomerative


def gaussian_mixture_clustering(scaled_data, n_components):
    """Fit a Gaussian Mixture Model; return ``(labels, fitted_model)``."""
    gmm = GaussianMixture(n_components=n_components, random_state=42)
    gmm.fit(scaled_data)
    return gmm.predict(scaled_data), gmm


def plot_scatter(df, labels, title, scaled_data):
    """Project ``scaled_data`` to 2-D with PCA and render a Plotly scatter.

    ``df`` is unused but kept for call-site compatibility.
    """
    pca = PCA(n_components=2)
    reduced_data = pca.fit_transform(scaled_data)
    df_pca = pd.DataFrame(reduced_data, columns=['PC1', 'PC2'])
    df_pca['Cluster'] = labels
    fig = px.scatter(df_pca, x='PC1', y='PC2', color='Cluster', title=title)
    st.plotly_chart(fig)


def plot_elbow_curve(scaled_data, max_clusters):
    """Render the KMeans elbow curve (WCSS vs. k) for k = 1..max_clusters."""
    wcss = []
    for k in range(1, max_clusters + 1):
        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(scaled_data)
        wcss.append(kmeans.inertia_)
    fig, ax = plt.subplots()
    ax.plot(range(1, max_clusters + 1), wcss, marker='o')
    ax.set_title('Elbow Curve')
    ax.set_xlabel('Number of Clusters')
    ax.set_ylabel('WCSS')
    st.pyplot(fig)


def display_performance_metrics(labels, scaled_data):
    """Show the silhouette score, or a note when it is undefined.

    silhouette_score requires at least two distinct labels; DBSCAN's noise
    label (-1) counts as a cluster here, matching the original behavior.
    """
    if len(set(labels)) > 1:
        silhouette = silhouette_score(scaled_data, labels)
        st.write(f"Silhouette Score: {silhouette:.2f}")
    else:
        st.write("Silhouette Score cannot be computed for a single cluster.")


def main():
    """Entry point: render the full Streamlit UI."""
    st.title("Unsupervised Learning for App Recommendation")

    st.sidebar.header("Upload Custom Data Here")
    file = st.sidebar.file_uploader("Upload CSV file", type=["csv"])
    if file is None:
        # Fall back to the bundled sample dataset.
        file = './googleplaystoreapps.csv'

    df, scaled_data, scaler = load_and_preprocess_data(file)
    if df is None:
        # load_and_preprocess_data already displayed the error.
        st.stop()

    st.sidebar.header("Parameter Tuning")
    n_clusters = st.sidebar.slider("Number of Clusters", 2, 10, 3)
    eps = st.sidebar.slider("Epsilon (DBSCAN)", 0.1, 1.0, 0.5, 0.1)
    min_samples = st.sidebar.slider("Minimum Samples (DBSCAN)", 1, 10, 5)
    n_components = st.sidebar.slider("Number of Components (GMM)", 2, 10, 3)

    # Tabs for the different algorithms. Labels are kept in distinct
    # variables so the download section below exports the right columns
    # (the original reused one `labels` name, so KMeans' column silently
    # received the GMM labels).
    tab1, tab2, tab3, tab4, tab5 = st.tabs(
        ["KMeans", "DBSCAN", "Agglomerative Clustering",
         "Gaussian Mixture Model", "Feature Correlation"])

    with tab1:
        st.header("KMeans Clustering")
        kmeans_labels, kmeans = kmeans_clustering(scaled_data, n_clusters)
        plot_scatter(df, kmeans_labels, "KMeans Clustering", scaled_data)
        display_performance_metrics(kmeans_labels, scaled_data)
        plot_elbow_curve(scaled_data, 10)

    with tab2:
        st.header("DBSCAN Clustering")
        dbscan_labels, dbscan = dbscan_clustering(scaled_data, eps, min_samples)
        plot_scatter(df, dbscan_labels, "DBSCAN Clustering", scaled_data)
        display_performance_metrics(dbscan_labels, scaled_data)

    with tab3:
        st.header("Agglomerative Clustering")
        agglo_labels, agglomerative = agglomerative_clustering(
            scaled_data, n_clusters)
        plot_scatter(df, agglo_labels, "Agglomerative Clustering", scaled_data)
        display_performance_metrics(agglo_labels, scaled_data)

    with tab4:
        st.header("Gaussian Mixture Model")
        gmm_labels, gmm = gaussian_mixture_clustering(scaled_data, n_components)
        plot_scatter(df, gmm_labels, "Gaussian Mixture Model", scaled_data)
        display_performance_metrics(gmm_labels, scaled_data)

    with tab5:
        st.header("Feature Correlation Analysis")
        numerical_df = df.select_dtypes(include=[np.number])
        corr_matrix = numerical_df.corr()
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
        st.pyplot(fig)

    # ------------------------------------------------------------------
    # New data point input
    # ------------------------------------------------------------------
    st.sidebar.header("Input New Data Point")
    new_data = {}
    original_values = {}
    le_dict = {}
    # One encoder per categorical column. LabelEncoder sorts its classes,
    # so encoding here matches the encoding used during preprocessing.
    for col in categorical_columns:
        original_values[col] = df[col].unique()
        le_dict[col] = LabelEncoder().fit(original_values[col])

    for col in df.columns:
        if col in categorical_columns:
            # Show original values, store the encoded code.
            selected_value = st.sidebar.selectbox(f"Select {col}",
                                                  original_values[col])
            new_data[col] = le_dict[col].transform([selected_value])[0]
        else:
            default_value = df[col].mean()
            if col == 'Rating':
                # Only ratings live on a 1-5 scale; clamping every numeric
                # default (as before) forced Reviews/Installs defaults to 5.
                default_value = np.clip(default_value, 1.0, 5.0)
            new_data[col] = st.sidebar.number_input(
                f"Enter {col}", value=float(default_value))

    new_data_df = pd.DataFrame([new_data])

    # Scale numerics with the SAME scaler fit during preprocessing, then
    # append the encoded categoricals in `categorical_columns` order — the
    # same column order as `scaled_data` (the original appended them in
    # df.columns order, misaligning features for prediction).
    new_data_scaled = np.hstack([
        scaler.transform(new_data_df[numerical_features]),
        new_data_df[categorical_columns].values,
    ])

    # ------------------------------------------------------------------
    # Cluster prediction for the new point
    # ------------------------------------------------------------------
    st.sidebar.header("Cluster Prediction")
    if st.sidebar.button("Predict"):
        kmeans_label = kmeans.predict(new_data_scaled)
        # DBSCAN has no predict(); fitting on a single point always yields
        # the noise label (-1). A fresh instance is used so the tab's fitted
        # `dbscan.labels_` (exported below) is not clobbered by the refit.
        dbscan_label = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(
            new_data_scaled)
        # AgglomerativeClustering has no predict() either.
        agglomerative_label = [-1]
        gmm_label = gmm.predict(new_data_scaled)

        st.sidebar.write(f"KMeans Cluster: {kmeans_label[0]}")
        st.sidebar.write(f"DBSCAN Cluster: {dbscan_label[0]}")
        st.sidebar.write(f"Agglomerative Cluster: {agglomerative_label[0]}")
        st.sidebar.write(f"GMM Cluster: {gmm_label[0]}")

    # ------------------------------------------------------------------
    # Download results
    # ------------------------------------------------------------------
    st.sidebar.header("Download Results")
    results = pd.DataFrame({
        'Cluster (KMeans)': kmeans_labels,
        'Cluster (DBSCAN)': dbscan_labels,
        'Cluster (Agglomerative)': agglo_labels,
        'Cluster (GMM)': gmm_labels,
    })
    # download_button handles the click itself; nesting it inside another
    # st.button (as before) made it disappear on the rerun after the first
    # click, requiring two clicks.
    st.sidebar.download_button("Download CSV", results.to_csv(index=False),
                               "results.csv")


if __name__ == "__main__":
    main()