# Streamlit app (Hugging Face Space): clustering-based Google Play app recommendation.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import streamlit as st
from sklearn.cluster import AgglomerativeClustering, DBSCAN, KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import LabelEncoder, StandardScaler
# Function to load and preprocess the data
def load_and_preprocess_data(file_uploaded):
    """Load the Play Store CSV, clean it, and build the scaled feature matrix.

    Parameters
    ----------
    file_uploaded : file-like or str
        Anything accepted by ``pd.read_csv`` (upload buffer or path).

    Returns
    -------
    tuple
        ``(df, scaled_data, scaler)`` on success, where ``df`` holds the
        selected raw feature columns, ``scaled_data`` is a numpy array of
        standardized numeric features followed by label-encoded categorical
        features, and ``scaler`` is the fitted StandardScaler.
        ``(None, None, None)`` on failure (the error is shown in the UI).
    """
    try:
        df = pd.read_csv(file_uploaded)
        df = df.dropna()

        categorical_columns = ['Category', 'Content Rating', 'Genres', 'Type']

        # Replace 'Varies with device' with the most common concrete size
        mode_size = df.loc[df['Size'] != 'Varies with device', 'Size'].mode()[0]
        df['Size'] = df['Size'].replace('Varies with device', mode_size)
        # Convert 'Size' strings ('19M', '201k') to megabytes
        df['Size'] = df['Size'].apply(
            lambda x: float(str(x).replace('M', '')) if 'M' in str(x)
            else float(str(x).replace('k', '')) / 1000
        )
        # Convert 'Installs' ('10,000+') to int
        df['Installs'] = df['Installs'].apply(
            lambda x: int(str(x).replace('+', '').replace(',', '')))
        # Convert 'Price' ('$4.99') to float
        df['Price'] = df['Price'].apply(lambda x: float(str(x).replace('$', '')))

        # Keep only the features used for clustering.  (The original code
        # encoded categoricals here and then immediately dropped the encoded
        # columns by this selection — that redundant pass is removed.)
        features = ['Category', 'Rating', 'Reviews', 'Size', 'Installs',
                    'Type', 'Price', 'Content Rating', 'Genres']
        df = df[features].copy()

        # Scale only the numerical features
        numerical_features = ['Rating', 'Reviews', 'Size', 'Installs', 'Price']
        scaler = StandardScaler()
        df_scaled = pd.DataFrame(
            scaler.fit_transform(df[numerical_features]),
            columns=numerical_features)

        # Append label-encoded categorical features, one encoder per column
        for col in categorical_columns:
            le = LabelEncoder()
            df_scaled[col + '_encoded'] = le.fit_transform(df[col].values)

        return df, df_scaled.values, scaler
    except Exception as e:
        st.error(f"Error loading and preprocessing data: {e}")
        # Explicit failure triple so callers can still unpack three values
        return None, None, None
# Function to implement KMeans
def kmeans_clustering(scaled_data, n_clusters):
    """Fit KMeans on the scaled feature matrix; return (labels, fitted model)."""
    model = KMeans(n_clusters=n_clusters, random_state=42).fit(scaled_data)
    return model.labels_, model
# Function to implement DBSCAN
def dbscan_clustering(scaled_data, eps, min_samples):
    """Fit density-based clustering; return (labels, fitted model).

    Noise points receive the label -1.
    """
    model = DBSCAN(eps=eps, min_samples=min_samples).fit(scaled_data)
    return model.labels_, model
# Function to implement Agglomerative Clustering
def agglomerative_clustering(scaled_data, n_clusters):
    """Fit bottom-up hierarchical clustering; return (labels, fitted model)."""
    model = AgglomerativeClustering(n_clusters=n_clusters).fit(scaled_data)
    return model.labels_, model
# Function to implement Gaussian Mixture Model
def gaussian_mixture_clustering(scaled_data, n_components):
    """Fit a GMM and return (hard cluster assignments, fitted model)."""
    model = GaussianMixture(n_components=n_components, random_state=42)
    model.fit(scaled_data)
    assignments = model.predict(scaled_data)
    return assignments, model
# Function to plot scatter plot
def plot_scatter(df, labels, title, scaled_data):
    """Project scaled_data onto 2 PCA components and show a Plotly scatter.

    ``df`` is accepted for interface compatibility but is not used here.
    """
    components = PCA(n_components=2).fit_transform(scaled_data)
    plot_df = pd.DataFrame(components, columns=['PC1', 'PC2'])
    plot_df['Cluster'] = labels
    st.plotly_chart(
        px.scatter(plot_df, x='PC1', y='PC2', color='Cluster', title=title))
# Function to plot elbow curve
def plot_elbow_curve(scaled_data, max_clusters):
    """Plot within-cluster sum of squares for k = 1..max_clusters (elbow method)."""
    ks = list(range(1, max_clusters + 1))
    wcss = [KMeans(n_clusters=k, random_state=42).fit(scaled_data).inertia_
            for k in ks]
    fig, ax = plt.subplots()
    ax.plot(ks, wcss, marker='o')
    ax.set_title('Elbow Curve')
    ax.set_xlabel('Number of Clusters')
    ax.set_ylabel('WCSS')
    st.pyplot(fig)
# Function to display performance metrics
def display_performance_metrics(labels, scaled_data):
    """Write the silhouette score to the page (needs at least 2 distinct labels)."""
    if len(set(labels)) <= 1:
        st.write("Silhouette Score cannot be computed for a single cluster.")
        return
    score = silhouette_score(scaled_data, labels)
    st.write(f"Silhouette Score: {score:.2f}")
# Define categorical columns globally
# Shared by preprocessing and the sidebar input widgets in main().
categorical_columns = ['Category', 'Content Rating', 'Genres', 'Type']
# Main function
def main():
    """Streamlit entry point for the app-recommendation clustering dashboard.

    Loads the Play Store dataset (uploaded or bundled), runs four clustering
    algorithms over the scaled features, shows per-algorithm diagnostics,
    predicts a cluster for a user-entered app, and offers a CSV export.
    """
    st.title("Unsupervised Learning for App Recommendation")

    # File upload — fall back to the bundled dataset when nothing is uploaded
    st.sidebar.header("Upload Custom Data Here")
    file = st.sidebar.file_uploader("Upload CSV file", type=["csv"])
    if file is None:
        file = './googleplaystoreapps.csv'

    # Guard against preprocessing failure: the loader reports its own error
    # and may return None (or a (None, None, None) triple); without this
    # check the unpack below raised a TypeError on bad input files.
    loaded = load_and_preprocess_data(file)
    if loaded is None or loaded[0] is None:
        st.stop()
    df, scaled_data, scaler = loaded

    # Sidebar for parameter tuning
    st.sidebar.header("Parameter Tuning")
    n_clusters = st.sidebar.slider("Number of Clusters", 2, 10, 3)
    eps = st.sidebar.slider("Epsilon (DBSCAN)", 0.1, 1.0, 0.5, 0.1)
    min_samples = st.sidebar.slider("Minimum Samples (DBSCAN)", 1, 10, 5)
    n_components = st.sidebar.slider("Number of Components (GMM)", 2, 10, 3)

    # Tabs for different algorithms.  Each algorithm's labels live in their
    # own variable so the export below cannot mix them up (previously a
    # shared `labels` meant the 'Cluster (KMeans)' column held GMM labels).
    tab1, tab2, tab3, tab4, tab5 = st.tabs([
        "KMeans", "DBSCAN", "Agglomerative Clustering",
        "Gaussian Mixture Model", "Feature Correlation"])

    with tab1:
        st.header("KMeans Clustering")
        kmeans_labels, kmeans = kmeans_clustering(scaled_data, n_clusters)
        plot_scatter(df, kmeans_labels, "KMeans Clustering", scaled_data)
        display_performance_metrics(kmeans_labels, scaled_data)
        plot_elbow_curve(scaled_data, 10)

    with tab2:
        st.header("DBSCAN Clustering")
        dbscan_labels, dbscan = dbscan_clustering(scaled_data, eps, min_samples)
        plot_scatter(df, dbscan_labels, "DBSCAN Clustering", scaled_data)
        display_performance_metrics(dbscan_labels, scaled_data)

    with tab3:
        st.header("Agglomerative Clustering")
        agg_labels, agglomerative = agglomerative_clustering(scaled_data, n_clusters)
        plot_scatter(df, agg_labels, "Agglomerative Clustering", scaled_data)
        display_performance_metrics(agg_labels, scaled_data)

    with tab4:
        st.header("Gaussian Mixture Model")
        gmm_labels, gmm = gaussian_mixture_clustering(scaled_data, n_components)
        plot_scatter(df, gmm_labels, "Gaussian Mixture Model", scaled_data)
        display_performance_metrics(gmm_labels, scaled_data)

    with tab5:
        st.header("Feature Correlation Analysis")
        corr_matrix = df.select_dtypes(include=[np.number]).corr()
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
        st.pyplot(fig)

    # User input for prediction
    st.sidebar.header("Input New Data Point")
    # One LabelEncoder per categorical column.  LabelEncoder sorts its
    # classes, so fitting on unique() reproduces the integer codes used
    # when scaled_data was built in preprocessing.
    original_values = {}
    le_dict = {}
    for col in categorical_columns:
        original_values[col] = df[col].unique()
        le_dict[col] = LabelEncoder().fit(original_values[col])

    new_data = {}
    for col in df.columns:
        if col in categorical_columns:
            # Show human-readable values, store the encoded integer
            selected_value = st.sidebar.selectbox(f"Select {col}", original_values[col])
            new_data[col] = le_dict[col].transform([selected_value])[0]
        else:
            default = df[col].mean()
            if col == 'Rating':
                # Only ratings are bounded to [1, 5]; clipping every numeric
                # default (as before) forced absurd values for Installs etc.
                default = np.clip(default, 1.0, 5.0)
            new_data[col] = st.sidebar.number_input(f"Enter {col}", value=float(default))

    new_data_df = pd.DataFrame([new_data])
    # Scale the numerics with the training scaler, then append the encoded
    # categoricals in the SAME column order used to build scaled_data.
    # (Iterating df.columns, as before, scrambled the categorical order.)
    numerical_features = ['Rating', 'Reviews', 'Size', 'Installs', 'Price']
    new_data_scaled = np.hstack([
        scaler.transform(new_data_df[numerical_features]),
        new_data_df[categorical_columns].values,
    ])

    # Predict cluster for new data point
    st.sidebar.header("Cluster Prediction")
    if st.sidebar.button("Predict"):
        kmeans_label = kmeans.predict(new_data_scaled)
        # DBSCAN has no predict(); fitting on a single point always labels it
        # noise (-1).  A fresh instance is used so the fitted model's labels_
        # (consumed by the export below) are not clobbered.
        dbscan_label = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(new_data_scaled)
        agglomerative_label = [-1]  # AgglomerativeClustering cannot predict new points
        gmm_label = gmm.predict(new_data_scaled)
        st.sidebar.write(f"KMeans Cluster: {kmeans_label[0]}")
        st.sidebar.write(f"DBSCAN Cluster: {dbscan_label[0]}")
        st.sidebar.write(f"Agglomerative Cluster: {agglomerative_label[0]}")
        st.sidebar.write(f"GMM Cluster: {gmm_label[0]}")

    # Download results
    st.sidebar.header("Download Results")
    if st.sidebar.button("Download Results"):
        results = pd.DataFrame({
            'Cluster (KMeans)': kmeans_labels,
            'Cluster (DBSCAN)': dbscan_labels,
            'Cluster (Agglomerative)': agg_labels,
            'Cluster (GMM)': gmm_labels,
        })
        st.sidebar.download_button("Download CSV", results.to_csv(index=False), "results.csv")
# Script entry point: launch the Streamlit app
if __name__ == "__main__":
    main()