File size: 9,906 Bytes
28a5f7d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import plotly.express as px

# Function to load and preprocess the data
def load_and_preprocess_data(file_uploaded):
    """Read the Play Store CSV, clean it, and build the scaled feature matrix.

    Parameters
    ----------
    file_uploaded : str or file-like
        Path or uploaded file object accepted by ``pd.read_csv``.

    Returns
    -------
    tuple
        ``(df, scaled_data, scaler)`` — the cleaned feature DataFrame, the
        numeric matrix used for clustering (scaled numerics + label-encoded
        categoricals), and the fitted ``StandardScaler`` (needed later to
        transform user-entered data points).
    """
    try:
        df = pd.read_csv(file_uploaded)
        df = df.dropna()

        categorical_columns = ['Category', 'Content Rating', 'Genres', 'Type']

        # Replace 'Varies with device' with the most common (mode) size string
        df['Size'] = df['Size'].replace(
            'Varies with device',
            df[df['Size'] != 'Varies with device']['Size'].mode()[0],
        )

        # Convert 'Size' to numeric megabytes: 'M' values as-is, 'k' values / 1000.
        # NOTE(review): assumes every size string carries an 'M' or 'k' suffix
        # after the replacement above — confirm against the dataset.
        df['Size'] = df['Size'].apply(
            lambda x: float(str(x).replace('M', '')) if 'M' in str(x)
            else float(str(x).replace('k', '')) / 1000
        )

        # Strip '+' and thousands separators from 'Installs'
        df['Installs'] = df['Installs'].apply(lambda x: int(str(x).replace('+', '').replace(',', '')))

        # Strip the currency symbol from 'Price'
        df['Price'] = df['Price'].apply(lambda x: float(str(x).replace('$', '')))

        # Keep only the columns relevant for clustering
        features = ['Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres']
        df = df[features].copy()

        numerical_features = ['Rating', 'Reviews', 'Size', 'Installs', 'Price']

        # Scale only the numerical features
        scaler = StandardScaler()
        df_scaled = pd.DataFrame(scaler.fit_transform(df[numerical_features]), columns=numerical_features)

        # Label-encode each categorical column with its own fresh encoder and
        # append to the scaled matrix (unscaled, matching the original layout).
        for col in categorical_columns:
            df_scaled[col + '_encoded'] = LabelEncoder().fit_transform(df[col])

        scaled_data = df_scaled.values

        return df, scaled_data, scaler
    except Exception as e:
        # Report the problem and halt this script run; the original fell
        # through and returned None, which crashed the caller's tuple unpack.
        st.error(f"Error loading and preprocessing data: {e}")
        st.stop()

# Function to implement KMeans
def kmeans_clustering(scaled_data, n_clusters):
    """Fit KMeans on the scaled matrix and return (labels, fitted model)."""
    model = KMeans(n_clusters=n_clusters, random_state=42)
    labels = model.fit_predict(scaled_data)
    return labels, model

# Function to implement DBSCAN
def dbscan_clustering(scaled_data, eps, min_samples):
    """Fit DBSCAN and return (labels, fitted model); noise points get -1."""
    model = DBSCAN(eps=eps, min_samples=min_samples)
    labels = model.fit_predict(scaled_data)
    return labels, model

# Function to implement Agglomerative Clustering
def agglomerative_clustering(scaled_data, n_clusters):
    """Fit hierarchical (agglomerative) clustering; return (labels, model)."""
    model = AgglomerativeClustering(n_clusters=n_clusters)
    labels = model.fit_predict(scaled_data)
    return labels, model

# Function to implement Gaussian Mixture Model
def gaussian_mixture_clustering(scaled_data, n_components):
    """Fit a GMM and return (hard cluster assignments, fitted model)."""
    model = GaussianMixture(n_components=n_components, random_state=42)
    model.fit(scaled_data)
    assignments = model.predict(scaled_data)
    return assignments, model

# Function to plot scatter plot
def plot_scatter(df, labels, title, scaled_data):
    """Project scaled_data to 2-D via PCA and render an interactive scatter.

    Note: ``df`` is unused here; the parameter is kept so existing call
    sites remain valid.
    """
    components = PCA(n_components=2).fit_transform(scaled_data)
    plot_df = pd.DataFrame(components, columns=['PC1', 'PC2'])
    plot_df['Cluster'] = labels
    figure = px.scatter(plot_df, x='PC1', y='PC2', color='Cluster', title=title)
    st.plotly_chart(figure)

# Function to plot elbow curve
def plot_elbow_curve(scaled_data, max_clusters):
    """Render the WCSS-vs-k elbow curve for k = 1 .. max_clusters."""
    cluster_range = range(1, max_clusters + 1)
    wcss = [
        KMeans(n_clusters=k, random_state=42).fit(scaled_data).inertia_
        for k in cluster_range
    ]
    fig, ax = plt.subplots()
    ax.plot(cluster_range, wcss, marker='o')
    ax.set_title('Elbow Curve')
    ax.set_xlabel('Number of Clusters')
    ax.set_ylabel('WCSS')
    st.pyplot(fig)

# Function to display performance metrics
def display_performance_metrics(labels, scaled_data):
    """Report the silhouette score (undefined when only one cluster exists)."""
    if len(set(labels)) <= 1:
        st.write("Silhouette Score cannot be computed for a single cluster.")
        return
    score = silhouette_score(scaled_data, labels)
    st.write(f"Silhouette Score: {score:.2f}")

# Define categorical columns globally — reused by main() when rebuilding the
# per-column label encoders for the sidebar input widgets.
categorical_columns = ['Category', 'Content Rating', 'Genres', 'Type']

# Main function
def main():
    """Streamlit entry point.

    Loads the dataset (uploaded file or the bundled sample), runs four
    clustering algorithms in separate tabs, and provides sidebar widgets to
    classify a user-entered data point and download cluster assignments.
    """
    st.title("Unsupervised Learning for App Recommendation")

    # File upload — fall back to the bundled sample when nothing is uploaded
    file = st.sidebar.file_uploader("Upload CSV file", type=["csv"])
    if file is None:
        file = './googleplaystoreapps.csv'

    # Sidebar for parameter tuning
    st.sidebar.header("Upload Custom Data Here")
    df, scaled_data, scaler = load_and_preprocess_data(file)
    st.sidebar.header("Parameter Tuning")
    n_clusters = st.sidebar.slider("Number of Clusters", 2, 10, 3)
    eps = st.sidebar.slider("Epsilon (DBSCAN)", 0.1, 1.0, 0.5, 0.1)
    min_samples = st.sidebar.slider("Minimum Samples (DBSCAN)", 1, 10, 5)
    n_components = st.sidebar.slider("Number of Components (GMM)", 2, 10, 3)

    # Tabs for different algorithms
    tab1, tab2, tab3, tab4, tab5 = st.tabs(["KMeans", "DBSCAN", "Agglomerative Clustering", "Gaussian Mixture Model", "Feature Correlation"])

    with tab1:
        st.header("KMeans Clustering")
        kmeans_labels, kmeans = kmeans_clustering(scaled_data, n_clusters)
        plot_scatter(df, kmeans_labels, "KMeans Clustering", scaled_data)
        display_performance_metrics(kmeans_labels, scaled_data)
        plot_elbow_curve(scaled_data, 10)

    with tab2:
        st.header("DBSCAN Clustering")
        dbscan_labels, dbscan = dbscan_clustering(scaled_data, eps, min_samples)
        plot_scatter(df, dbscan_labels, "DBSCAN Clustering", scaled_data)
        display_performance_metrics(dbscan_labels, scaled_data)

    with tab3:
        st.header("Agglomerative Clustering")
        agg_labels, agglomerative = agglomerative_clustering(scaled_data, n_clusters)
        plot_scatter(df, agg_labels, "Agglomerative Clustering", scaled_data)
        display_performance_metrics(agg_labels, scaled_data)

    with tab4:
        st.header("Gaussian Mixture Model")
        gmm_labels, gmm = gaussian_mixture_clustering(scaled_data, n_components)
        plot_scatter(df, gmm_labels, "Gaussian Mixture Model", scaled_data)
        display_performance_metrics(gmm_labels, scaled_data)

    with tab5:
        st.header("Feature Correlation Analysis")
        numerical_df = df.select_dtypes(include=[np.number])
        corr_matrix = numerical_df.corr()
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
        st.pyplot(fig)

    # User input for prediction
    st.sidebar.header("Input New Data Point")
    new_data = {}
    # Store the original categorical values before encoding
    original_values = {}
    le_dict = {}
    for col in categorical_columns:
        le = LabelEncoder()
        original_values[col] = df[col].unique()
        le_dict[col] = le.fit(original_values[col])

    for col in df.columns:
        if col in categorical_columns:
            # Use original values for display but store encoded value
            selected_value = st.sidebar.selectbox(f"Select {col}", original_values[col])
            new_data[col] = le_dict[col].transform([selected_value])[0]
        else:
            # Default each numeric input to the column mean. Only 'Rating' is
            # bounded (1-5); the original clipped EVERY numeric default into
            # [1, 5], which mangled Reviews/Installs/Size/Price defaults.
            mean_value = df[col].mean()
            if col == 'Rating':
                mean_value = np.clip(mean_value, 1.0, 5.0)
            new_data[col] = st.sidebar.number_input(f"Enter {col}", value=float(mean_value))

    new_data_df = pd.DataFrame([new_data])
    # Scale the numerical features of the new data point
    numerical_features = ['Rating', 'Reviews', 'Size', 'Installs', 'Price']
    new_data_scaled = scaler.transform(new_data_df[numerical_features])

    # Append the encoded categorical features (unscaled, matching the
    # column layout produced by load_and_preprocess_data)
    new_data_scaled = np.hstack([
        new_data_scaled,
        new_data_df[[col for col in new_data_df.columns if col in categorical_columns]].values
    ])

    # Predict cluster for new data point
    st.sidebar.header("Cluster Prediction")
    if st.sidebar.button("Predict"):
        kmeans_label = kmeans.predict(new_data_scaled)
        # NOTE(review): DBSCAN has no predict(); refitting on a single point
        # yields -1 (noise) whenever min_samples > 1 — confirm this is intended.
        dbscan_label = dbscan.fit_predict(new_data_scaled)
        # AgglomerativeClustering cannot classify unseen points; placeholder.
        agglomerative_label = [-1]
        gmm_label = gmm.predict(new_data_scaled)

        st.sidebar.write(f"KMeans Cluster: {kmeans_label[0]}")
        st.sidebar.write(f"DBSCAN Cluster: {dbscan_label[0]}")
        st.sidebar.write(f"Agglomerative Cluster: {agglomerative_label[0]}")
        st.sidebar.write(f"GMM Cluster: {gmm_label[0]}")

    # Download results
    st.sidebar.header("Download Results")
    if st.sidebar.button("Download Results"):
        # Use each fitted model's own labels; the original reused a stale
        # `labels` variable here, so the KMeans column held GMM assignments.
        results = pd.DataFrame({
            'Cluster (KMeans)': kmeans.labels_,
            'Cluster (DBSCAN)': dbscan.labels_,
            'Cluster (Agglomerative)': agglomerative.labels_,
            'Cluster (GMM)': gmm.predict(scaled_data)
        })
        st.sidebar.download_button("Download CSV", results.to_csv(index=False), "results.csv")

if __name__ == "__main__":
    main()