Spaces:

DhominickJ
/

AppCluster_Prediction

Sleeping

App Files Files Community

DhominickJ committed on Mar 2, 2025

Commit

28a5f7d

1 Parent(s): a0f7bfa

Initial Commit for the Mall Customers Prediction

Browse files

Files changed (3) hide show

app.py +233 -0
googleplaystoreapps.csv +0 -0
requirements.txt +6 -0

app.py ADDED Viewed

	@@ -0,0 +1,233 @@

+import streamlit as st
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.preprocessing import StandardScaler, LabelEncoder
+from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
+from sklearn.mixture import GaussianMixture
+from sklearn.decomposition import PCA
+from sklearn.metrics import silhouette_score
+import plotly.express as px
+# Function to load and preprocess the data
+def load_and_preprocess_data(file_uploaded):
+    try:
+        df = pd.read_csv(file_uploaded)
+        df = df.dropna()
+        # Encode categorical variables
+        le = LabelEncoder()
+        categorical_columns = ['Category', 'Content Rating', 'Genres', 'Type']
+        for col in categorical_columns:
+            df[col + '_encoded'] = le.fit_transform(df[col])
+        # Replace 'Varies with device' with mean size
+        df['Size'] = df['Size'].replace('Varies with device', df[df['Size'] != 'Varies with device']['Size'].mode()[0])
+        # Convert 'Size' to numeric
+        df['Size'] = df['Size'].apply(lambda x: float(str(x).replace('M', '')) if 'M' in str(x) else float(str(x).replace('k', '')) / 1000)
+        # Convert 'Installs' to numeric
+        df['Installs'] = df['Installs'].apply(lambda x: int(str(x).replace('+', '').replace(',', '')))
+        # Convert 'Price' to numeric
+        df['Price'] = df['Price'].apply(lambda x: float(str(x).replace('$', '')))
+        # Select relevant features for clustering
+        features = ['Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres']
+        df_features = df[features]
+        df = df_features.copy()
+        # Separate numerical and encoded categorical features
+        numerical_features = ['Rating', 'Reviews', 'Size', 'Installs', 'Price']
+        categorical_encoded = [col + '_encoded' for col in categorical_columns]
+        # Scale only numerical features
+        scaler = StandardScaler()
+        df_scaled = pd.DataFrame(scaler.fit_transform(df[numerical_features]), columns=numerical_features)
+        # Add encoded categorical features to scaled data
+        for col, base_col in zip(categorical_encoded, categorical_columns):
+            df_scaled[col] = le.fit_transform(df[base_col])
+        scaled_data = df_scaled.values
+        return df, scaled_data, scaler
+    except Exception as e:
+        st.error(f"Error loading and preprocessing data: {e}")
+# Function to implement KMeans
+def kmeans_clustering(scaled_data, n_clusters):
+    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
+    kmeans.fit(scaled_data)
+    return kmeans.labels_, kmeans
+# Function to implement DBSCAN
+def dbscan_clustering(scaled_data, eps, min_samples):
+    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
+    dbscan.fit(scaled_data)
+    return dbscan.labels_, dbscan
+# Function to implement Agglomerative Clustering
+def agglomerative_clustering(scaled_data, n_clusters):
+    agglomerative = AgglomerativeClustering(n_clusters=n_clusters)
+    agglomerative.fit(scaled_data)
+    return agglomerative.labels_, agglomerative
+# Function to implement Gaussian Mixture Model
+def gaussian_mixture_clustering(scaled_data, n_components):
+    gmm = GaussianMixture(n_components=n_components, random_state=42)
+    gmm.fit(scaled_data)
+    return gmm.predict(scaled_data), gmm
+# Function to plot scatter plot
+def plot_scatter(df, labels, title, scaled_data):
+    pca = PCA(n_components=2)
+    reduced_data = pca.fit_transform(scaled_data)
+    df_pca = pd.DataFrame(reduced_data, columns=['PC1', 'PC2'])
+    df_pca['Cluster'] = labels
+    fig = px.scatter(df_pca, x='PC1', y='PC2', color='Cluster', title=title)
+    st.plotly_chart(fig)
+# Function to plot elbow curve
+def plot_elbow_curve(scaled_data, max_clusters):
+    wcss = []
+    for i in range(1, max_clusters + 1):
+        kmeans = KMeans(n_clusters=i, random_state=42)
+        kmeans.fit(scaled_data)
+        wcss.append(kmeans.inertia_)
+    fig, ax = plt.subplots()
+    ax.plot(range(1, max_clusters + 1), wcss, marker='o')
+    ax.set_title('Elbow Curve')
+    ax.set_xlabel('Number of Clusters')
+    ax.set_ylabel('WCSS')
+    st.pyplot(fig)
+# Function to display performance metrics
+def display_performance_metrics(labels, scaled_data):
+    if len(set(labels)) > 1:
+        silhouette = silhouette_score(scaled_data, labels)
+        st.write(f"Silhouette Score: {silhouette:.2f}")
+    else:
+        st.write("Silhouette Score cannot be computed for a single cluster.")
+# Define categorical columns globally
+categorical_columns = ['Category', 'Content Rating', 'Genres', 'Type']
+# Main function
+def main():
+    st.title("Unsupervised Learning for App Recommendation")
+    # File upload
+    file = st.sidebar.file_uploader("Upload CSV file", type=["csv"])
+    if file is None:
+        file = './googleplaystoreapps.csv'
+    if file is not None:
+        # Sidebar for parameter tuning
+        st.sidebar.header("Upload Custom Data Here")
+        df, scaled_data, scaler = load_and_preprocess_data(file)
+        st.sidebar.header("Parameter Tuning")
+        n_clusters = st.sidebar.slider("Number of Clusters", 2, 10, 3)
+        eps = st.sidebar.slider("Epsilon (DBSCAN)", 0.1, 1.0, 0.5, 0.1)
+        min_samples = st.sidebar.slider("Minimum Samples (DBSCAN)", 1, 10, 5)
+        n_components = st.sidebar.slider("Number of Components (GMM)", 2, 10, 3)
+        # Tabs for different algorithms
+        tab1, tab2, tab3, tab4, tab5 = st.tabs(["KMeans", "DBSCAN", "Agglomerative Clustering", "Gaussian Mixture Model", "Feature Correlation"])
+        with tab1:
+            st.header("KMeans Clustering")
+            labels, kmeans = kmeans_clustering(scaled_data, n_clusters)
+            plot_scatter(df, labels, "KMeans Clustering", scaled_data)
+            display_performance_metrics(labels, scaled_data)
+            plot_elbow_curve(scaled_data, 10)
+        with tab2:
+            st.header("DBSCAN Clustering")
+            labels, dbscan = dbscan_clustering(scaled_data, eps, min_samples)
+            plot_scatter(df, labels, "DBSCAN Clustering", scaled_data)
+            display_performance_metrics(labels, scaled_data)
+        with tab3:
+            st.header("Agglomerative Clustering")
+            labels, agglomerative = agglomerative_clustering(scaled_data, n_clusters)
+            plot_scatter(df, labels, "Agglomerative Clustering", scaled_data)
+            display_performance_metrics(labels, scaled_data)
+        with tab4:
+            st.header("Gaussian Mixture Model")
+            labels, gmm = gaussian_mixture_clustering(scaled_data, n_components)
+            plot_scatter(df, labels, "Gaussian Mixture Model", scaled_data)
+            display_performance_metrics(labels, scaled_data)
+        with tab5:
+            st.header("Feature Correlation Analysis")
+            numerical_df = df.select_dtypes(include=[np.number])
+            corr_matrix = numerical_df.corr()
+            fig, ax = plt.subplots(figsize=(10, 8))
+            sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
+            st.pyplot(fig)
+        # User input for prediction
+        st.sidebar.header("Input New Data Point")
+        new_data = {}
+        # Store the original categorical values before encoding
+        original_values = {}
+        le_dict = {}
+        for col in categorical_columns:
+            le = LabelEncoder()
+            original_values[col] = df[col].unique()
+            le_dict[col] = le.fit(original_values[col])
+        for col in df.columns:
+            if col in categorical_columns:
+                # Use original values for display but store encoded value
+                selected_value = st.sidebar.selectbox(f"Select {col}", original_values[col])
+                new_data[col] = le_dict[col].transform([selected_value])[0]
+            else:
+                mean_value = np.clip(df[col].mean(), 1.0, 5.0)
+                new_data[col] = st.sidebar.number_input(f"Enter {col}", value=float(mean_value))
+        new_data_df = pd.DataFrame([new_data])
+        # Scale the numerical features of the new data point
+        numerical_features = ['Rating', 'Reviews', 'Size', 'Installs', 'Price']
+        new_data_numerical = new_data_df[numerical_features]
+        new_data_scaled = scaler.transform(new_data_numerical)
+        # Add encoded categorical features
+        new_data_scaled = np.hstack([
+            new_data_scaled,
+            new_data_df[[col for col in new_data_df.columns if col in categorical_columns]].values
+        ])
+        # Predict cluster for new data point
+        st.sidebar.header("Cluster Prediction")
+        if st.sidebar.button("Predict"):
+            kmeans_label = kmeans.predict(new_data_scaled)
+            dbscan_label = dbscan.fit_predict(new_data_scaled)
+            agglomerative_label = [-1]
+            gmm_label = gmm.predict(new_data_scaled)
+            # Find similar apps based on cluster
+            kmeans_cluster_apps = df[kmeans.labels_ == kmeans_label[0]]
+            gmm_cluster_apps = df[gmm.predict(scaled_data) == gmm_label[0]]
+            st.sidebar.write(f"KMeans Cluster: {kmeans_label[0]}")
+            st.sidebar.write(f"DBSCAN Cluster: {dbscan_label[0]}")
+            st.sidebar.write(f"Agglomerative Cluster: {agglomerative_label[0]}")
+            st.sidebar.write(f"GMM Cluster: {gmm_label[0]}")
+        # Download results
+        st.sidebar.header("Download Results")
+        if st.sidebar.button("Download Results"):
+            results = pd.DataFrame({
+                'Cluster (KMeans)': labels,
+                'Cluster (DBSCAN)': dbscan.labels_,
+                'Cluster (Agglomerative)': agglomerative.labels_,
+                'Cluster (GMM)': gmm.predict(scaled_data)
+            })
+            st.sidebar.download_button("Download CSV", results.to_csv(index=False), "results.csv")
+if __name__ == "__main__":
+    main()

googleplaystoreapps.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+streamlit
+pandas
+numpy
+scikit-learn
+matplotlib
+seaborn
+plotly