File size: 9,906 Bytes
28a5f7d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import plotly.express as px

# Function to load and preprocess the data
def load_and_preprocess_data(file_uploaded):
    """Read the Play Store CSV, clean it, and build the scaled feature matrix.

    Parameters
    ----------
    file_uploaded : str or file-like
        Path or uploaded file object accepted by ``pd.read_csv``.

    Returns
    -------
    tuple
        ``(df, scaled_data, scaler)`` — the cleaned feature DataFrame, the
        numeric matrix used for clustering (scaled numerics + label-encoded
        categoricals), and the fitted ``StandardScaler`` (needed later to
        transform user-entered data points).
    """
    try:
        df = pd.read_csv(file_uploaded)
        df = df.dropna()

        categorical_columns = ['Category', 'Content Rating', 'Genres', 'Type']

        # Replace 'Varies with device' with the most common (mode) size string
        df['Size'] = df['Size'].replace(
            'Varies with device',
            df[df['Size'] != 'Varies with device']['Size'].mode()[0],
        )

        # Convert 'Size' to numeric megabytes: 'M' values as-is, 'k' values / 1000.
        # NOTE(review): assumes every size string carries an 'M' or 'k' suffix
        # after the replacement above — confirm against the dataset.
        df['Size'] = df['Size'].apply(
            lambda x: float(str(x).replace('M', '')) if 'M' in str(x)
            else float(str(x).replace('k', '')) / 1000
        )

        # Strip '+' and thousands separators from 'Installs'
        df['Installs'] = df['Installs'].apply(lambda x: int(str(x).replace('+', '').replace(',', '')))

        # Strip the currency symbol from 'Price'
        df['Price'] = df['Price'].apply(lambda x: float(str(x).replace('$', '')))

        # Keep only the columns relevant for clustering
        features = ['Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres']
        df = df[features].copy()

        numerical_features = ['Rating', 'Reviews', 'Size', 'Installs', 'Price']

        # Scale only the numerical features
        scaler = StandardScaler()
        df_scaled = pd.DataFrame(scaler.fit_transform(df[numerical_features]), columns=numerical_features)

        # Label-encode each categorical column with its own fresh encoder and
        # append to the scaled matrix (unscaled, matching the original layout).
        for col in categorical_columns:
            df_scaled[col + '_encoded'] = LabelEncoder().fit_transform(df[col])

        scaled_data = df_scaled.values

        return df, scaled_data, scaler
    except Exception as e:
        # Report the problem and halt this script run; the original fell
        # through and returned None, which crashed the caller's tuple unpack.
        st.error(f"Error loading and preprocessing data: {e}")
        st.stop()

# Function to implement KMeans
def kmeans_clustering(scaled_data, n_clusters):
    """Fit KMeans on the scaled matrix and return (labels, fitted model)."""
    model = KMeans(n_clusters=n_clusters, random_state=42)
    labels = model.fit_predict(scaled_data)
    return labels, model

# Function to implement DBSCAN
def dbscan_clustering(scaled_data, eps, min_samples):
    """Fit DBSCAN and return (labels, fitted model); noise points get -1."""
    model = DBSCAN(eps=eps, min_samples=min_samples)
    labels = model.fit_predict(scaled_data)
    return labels, model

# Function to implement Agglomerative Clustering
def agglomerative_clustering(scaled_data, n_clusters):
    """Fit hierarchical (agglomerative) clustering; return (labels, model)."""
    model = AgglomerativeClustering(n_clusters=n_clusters)
    labels = model.fit_predict(scaled_data)
    return labels, model

# Function to implement Gaussian Mixture Model
def gaussian_mixture_clustering(scaled_data, n_components):
    """Fit a GMM and return (hard cluster assignments, fitted model)."""
    model = GaussianMixture(n_components=n_components, random_state=42)
    model.fit(scaled_data)
    assignments = model.predict(scaled_data)
    return assignments, model

# Function to plot scatter plot
def plot_scatter(df, labels, title, scaled_data):
    """Project scaled_data to 2-D via PCA and render an interactive scatter.

    Note: ``df`` is unused here; the parameter is kept so existing call
    sites remain valid.
    """
    components = PCA(n_components=2).fit_transform(scaled_data)
    plot_df = pd.DataFrame(components, columns=['PC1', 'PC2'])
    plot_df['Cluster'] = labels
    figure = px.scatter(plot_df, x='PC1', y='PC2', color='Cluster', title=title)
    st.plotly_chart(figure)

# Function to plot elbow curve
def plot_elbow_curve(scaled_data, max_clusters):
    """Render the WCSS-vs-k elbow curve for k = 1 .. max_clusters."""
    cluster_range = range(1, max_clusters + 1)
    wcss = [
        KMeans(n_clusters=k, random_state=42).fit(scaled_data).inertia_
        for k in cluster_range
    ]
    fig, ax = plt.subplots()
    ax.plot(cluster_range, wcss, marker='o')
    ax.set_title('Elbow Curve')
    ax.set_xlabel('Number of Clusters')
    ax.set_ylabel('WCSS')
    st.pyplot(fig)

# Function to display performance metrics
def display_performance_metrics(labels, scaled_data):
    """Report the silhouette score (undefined when only one cluster exists)."""
    if len(set(labels)) <= 1:
        st.write("Silhouette Score cannot be computed for a single cluster.")
        return
    score = silhouette_score(scaled_data, labels)
    st.write(f"Silhouette Score: {score:.2f}")

# Define categorical columns globally — reused by main() when rebuilding the
# per-column label encoders for the sidebar input widgets.
categorical_columns = ['Category', 'Content Rating', 'Genres', 'Type']

# Main function
def main():
    """Streamlit entry point.

    Loads the dataset (uploaded file or the bundled sample), runs four
    clustering algorithms in separate tabs, and provides sidebar widgets to
    classify a user-entered data point and download cluster assignments.
    """
    st.title("Unsupervised Learning for App Recommendation")

    # File upload — fall back to the bundled sample when nothing is uploaded
    file = st.sidebar.file_uploader("Upload CSV file", type=["csv"])
    if file is None:
        file = './googleplaystoreapps.csv'

    # Sidebar for parameter tuning
    st.sidebar.header("Upload Custom Data Here")
    df, scaled_data, scaler = load_and_preprocess_data(file)
    st.sidebar.header("Parameter Tuning")
    n_clusters = st.sidebar.slider("Number of Clusters", 2, 10, 3)
    eps = st.sidebar.slider("Epsilon (DBSCAN)", 0.1, 1.0, 0.5, 0.1)
    min_samples = st.sidebar.slider("Minimum Samples (DBSCAN)", 1, 10, 5)
    n_components = st.sidebar.slider("Number of Components (GMM)", 2, 10, 3)

    # Tabs for different algorithms
    tab1, tab2, tab3, tab4, tab5 = st.tabs(["KMeans", "DBSCAN", "Agglomerative Clustering", "Gaussian Mixture Model", "Feature Correlation"])

    with tab1:
        st.header("KMeans Clustering")
        kmeans_labels, kmeans = kmeans_clustering(scaled_data, n_clusters)
        plot_scatter(df, kmeans_labels, "KMeans Clustering", scaled_data)
        display_performance_metrics(kmeans_labels, scaled_data)
        plot_elbow_curve(scaled_data, 10)

    with tab2:
        st.header("DBSCAN Clustering")
        dbscan_labels, dbscan = dbscan_clustering(scaled_data, eps, min_samples)
        plot_scatter(df, dbscan_labels, "DBSCAN Clustering", scaled_data)
        display_performance_metrics(dbscan_labels, scaled_data)

    with tab3:
        st.header("Agglomerative Clustering")
        agg_labels, agglomerative = agglomerative_clustering(scaled_data, n_clusters)
        plot_scatter(df, agg_labels, "Agglomerative Clustering", scaled_data)
        display_performance_metrics(agg_labels, scaled_data)

    with tab4:
        st.header("Gaussian Mixture Model")
        gmm_labels, gmm = gaussian_mixture_clustering(scaled_data, n_components)
        plot_scatter(df, gmm_labels, "Gaussian Mixture Model", scaled_data)
        display_performance_metrics(gmm_labels, scaled_data)

    with tab5:
        st.header("Feature Correlation Analysis")
        numerical_df = df.select_dtypes(include=[np.number])
        corr_matrix = numerical_df.corr()
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
        st.pyplot(fig)

    # User input for prediction
    st.sidebar.header("Input New Data Point")
    new_data = {}
    # Store the original categorical values before encoding
    original_values = {}
    le_dict = {}
    for col in categorical_columns:
        le = LabelEncoder()
        original_values[col] = df[col].unique()
        le_dict[col] = le.fit(original_values[col])

    for col in df.columns:
        if col in categorical_columns:
            # Use original values for display but store encoded value
            selected_value = st.sidebar.selectbox(f"Select {col}", original_values[col])
            new_data[col] = le_dict[col].transform([selected_value])[0]
        else:
            # Default each numeric input to the column mean. Only 'Rating' is
            # bounded (1-5); the original clipped EVERY numeric default into
            # [1, 5], which mangled Reviews/Installs/Size/Price defaults.
            mean_value = df[col].mean()
            if col == 'Rating':
                mean_value = np.clip(mean_value, 1.0, 5.0)
            new_data[col] = st.sidebar.number_input(f"Enter {col}", value=float(mean_value))

    new_data_df = pd.DataFrame([new_data])
    # Scale the numerical features of the new data point
    numerical_features = ['Rating', 'Reviews', 'Size', 'Installs', 'Price']
    new_data_scaled = scaler.transform(new_data_df[numerical_features])

    # Append the encoded categorical features (unscaled, matching the
    # column layout produced by load_and_preprocess_data)
    new_data_scaled = np.hstack([
        new_data_scaled,
        new_data_df[[col for col in new_data_df.columns if col in categorical_columns]].values
    ])

    # Predict cluster for new data point
    st.sidebar.header("Cluster Prediction")
    if st.sidebar.button("Predict"):
        kmeans_label = kmeans.predict(new_data_scaled)
        # NOTE(review): DBSCAN has no predict(); refitting on a single point
        # yields -1 (noise) whenever min_samples > 1 — confirm this is intended.
        dbscan_label = dbscan.fit_predict(new_data_scaled)
        # AgglomerativeClustering cannot classify unseen points; placeholder.
        agglomerative_label = [-1]
        gmm_label = gmm.predict(new_data_scaled)

        st.sidebar.write(f"KMeans Cluster: {kmeans_label[0]}")
        st.sidebar.write(f"DBSCAN Cluster: {dbscan_label[0]}")
        st.sidebar.write(f"Agglomerative Cluster: {agglomerative_label[0]}")
        st.sidebar.write(f"GMM Cluster: {gmm_label[0]}")

    # Download results
    st.sidebar.header("Download Results")
    if st.sidebar.button("Download Results"):
        # Use each fitted model's own labels; the original reused a stale
        # `labels` variable here, so the KMeans column held GMM assignments.
        results = pd.DataFrame({
            'Cluster (KMeans)': kmeans.labels_,
            'Cluster (DBSCAN)': dbscan.labels_,
            'Cluster (Agglomerative)': agglomerative.labels_,
            'Cluster (GMM)': gmm.predict(scaled_data)
        })
        st.sidebar.download_button("Download CSV", results.to_csv(index=False), "results.csv")

if __name__ == "__main__":
    main()