File size: 12,534 Bytes
1ff4c54
eb9a571
3b33677
eb9a571
 
0df6d66
10bcdc3
0df6d66
296a6db
 
cf3451a
0df6d66
62b26f7
10bcdc3
b5b0661
430b9e7
0df6d66
eb9a571
 
 
 
 
 
 
 
 
 
 
 
 
0df6d66
 
eb9a571
 
 
 
 
 
 
 
 
205ca40
eb9a571
0df6d66
eb9a571
92fae65
eb9a571
 
 
 
 
 
 
 
 
 
 
 
0df6d66
eb9a571
 
 
e77f587
eb9a571
 
0df6d66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8336158
 
0df6d66
6102e1f
f488371
8336158
6102e1f
0df6d66
 
 
 
80907a1
 
 
 
0df6d66
80907a1
8336158
0df6d66
 
 
 
 
 
 
 
 
62b26f7
e43973a
c142685
62b26f7
0df6d66
e43973a
0df6d66
 
f25d93a
0df6d66
cf3451a
f25d93a
e43973a
0df6d66
 
 
e43973a
0df6d66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5d5343f
 
0df6d66
 
 
 
 
 
 
 
 
eb9a571
0df6d66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a1e8fa8
0df6d66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ff72a94
0df6d66
 
 
 
 
 
 
 
ff72a94
0df6d66
 
4795f03
0df6d66
 
ff72a94
0df6d66
 
 
 
 
 
 
 
 
ff72a94
0df6d66
 
 
e92abc4
 
0df6d66
 
 
 
 
 
 
 
 
4795f03
0df6d66
 
 
 
 
 
 
2fb8103
f498bcc
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
import streamlit as st
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import plotly.express as px
import base64
import plotly.figure_factory as ff
import plotly.graph_objects as go
from scipy.spatial import ConvexHull
from scipy.spatial import distance
from sklearn.decomposition import PCA

st.set_page_config(layout="wide")

# CSS (not JS) injection: let the main content block span the full page width.
st.markdown("""
    <style>
        .reportview-container .main .block-container {
            max-width: 100%;
        }
    </style>
    """, unsafe_allow_html=True)

# Load the bundled iris dataset; X holds the 4 numeric features per sample
# (sepal length/width, petal length/width).
iris = datasets.load_iris()
X = iris.data

st.title('Understanding K-Means Clustering')

# Three top-level tabs: beginner view, advanced view, and credits.
tab1, tab2, about = st.tabs(["Basic ☕", "Advanced 🔬"," ℹ️ About"])

# 'toggle' controls whether the sidebar input widgets are rendered below.
if "toggle" not in st.session_state:
    st.session_state.toggle = True

toggle_button = st.button("Toggle Sidebar")

if toggle_button:
    st.session_state.toggle = not st.session_state.toggle

# Keycap-digit emojis used to label cluster centroids (list index = digit).
dmojis = ["0️⃣", "1️⃣", "2️⃣", "3️⃣", "4️⃣", "5️⃣", "6️⃣", "7️⃣", "8️⃣", "9️⃣"]

# Fallback values used when the sidebar is hidden and its sliders never run.
user_features = [6.5, 3.5, 4.5, 1.5]
n_clusters_advanced = 2

if st.session_state.toggle:
    # Sidebar: flower measurement inputs.
    st.sidebar.header('Input Your Flower Data')

    def user_input_features():
        """Render the four measurement sliders and return their values in order."""
        slider_specs = [
            ('Sepal Length (cm)', 4.0, 8.0, 6.5),
            ('Sepal Width (cm)', 2.0, 4.5, 3.5),
            ('Petal Length (cm)', 1.0, 7.0, 4.5),
            ('Petal Width (cm)', 0.1, 2.5, 1.5),
        ]
        return [st.sidebar.slider(label, low, high, default)
                for label, low, high, default in slider_specs]

    # Re-read the sliders on every rerun so changes take effect immediately.
    user_features = user_input_features()

    # Sidebar: cluster-count control shared by both tabs.
    st.sidebar.header('K-Means Parameters')
    n_clusters_advanced = st.sidebar.slider('Number of Clusters (K)', 1, 8, n_clusters_advanced)


# Make the main container tall and scrollable (raw CSS injection).
_scroll_css = """
<style>
    .reportview-container .main .block-container {
        overflow: auto;
        height: 2000px; 
    }
</style>
"""
st.markdown(_scroll_css, unsafe_allow_html=True)

with tab1:
    st.write("""
    ### What is Clustering?
    ##### Clustering with K-Means is a machine learning concept like tidying a messy room by grouping similar items, but for data instead of physical objects.
    """)

    # Initialize the PCA flag BEFORE any widget reads or flips it, so the
    # session-state key is guaranteed to exist on every script run.
    # (Originally the init ran after the toggle button, relying on rerun
    # ordering to avoid touching an undefined key.)
    if 'use_pca' not in st.session_state:
        st.session_state.use_pca = True

    # Button to toggle PCA on/off for the visualization below.
    if st.button('Toggle PCA for Visualization'):
        st.session_state.use_pca = not st.session_state.use_pca

    if st.session_state.use_pca:
        st.write("""
        ##### 🧠 PCA (Principal Component Analysis) is like looking at a messy room from the best angle to see the most mess. It helps us see our data more clearly!
        """)
        # Project the 4-D iris measurements down to 2-D for plotting.
        pca = PCA(n_components=2)
        X_transformed = pca.fit_transform(X)
        user_features_transformed = pca.transform([user_features])[0]
    else:
        X_transformed = X[:, :2]  # Just use the first two features for visualization
        user_features_transformed = user_features[:2]

    st.write("""
    ### Visualizing Groups
    ##### Here are the groups from our tidying method. Each color has a number at its center, representing its group.
    """)

    # DataFrame view of the 2-D points for convenient per-cluster filtering.
    df_transformed = pd.DataFrame(X_transformed, columns=['Feature1', 'Feature2'])

    # K-Means on the 2-D (visualization) space.  n_init=10 pins the classic
    # sklearn default explicitly, so releases where the default changed to
    # 'auto' behave identically and emit no FutureWarning.
    kmeans = KMeans(n_clusters=n_clusters_advanced, n_init=10)
    y_kmeans = kmeans.fit_predict(X_transformed)
    df_transformed['cluster'] = y_kmeans

    # Predict the cluster for the user input in the transformed space.
    predicted_cluster = kmeans.predict([user_features_transformed])

    fig = go.Figure()

    # Shade each cluster's footprint with its convex hull.
    for cluster in np.unique(y_kmeans):
        cluster_data = df_transformed[df_transformed['cluster'] == cluster]
        x_data = cluster_data['Feature1'].values
        y_data = cluster_data['Feature2'].values
        if len(cluster_data) > 2:  # ConvexHull requires at least 3 points
            hull = ConvexHull(cluster_data[['Feature1', 'Feature2']])
            fig.add_trace(go.Scatter(x=x_data[hull.vertices], y=y_data[hull.vertices], fill='toself', fillcolor=px.colors.qualitative.Set1[cluster], opacity=0.5, line=dict(width=0), showlegend=False))

    # Scatter of all samples; square markers signal "raw features" mode.
    if st.session_state.use_pca:
        fig.add_trace(go.Scatter(x=df_transformed['Feature1'], y=df_transformed['Feature2'], mode='markers', marker=dict(color=y_kmeans, colorscale=px.colors.qualitative.Set1), showlegend=False))
    else:
        fig.add_trace(go.Scatter(x=df_transformed['Feature1'], y=df_transformed['Feature2'], mode='markers', marker=dict(color=y_kmeans, colorscale=px.colors.qualitative.Set1, symbol='square'), showlegend=False))

    # Add user input as a star marker.
    fig.add_trace(go.Scatter(x=[user_features_transformed[0]], y=[user_features_transformed[1]], mode='markers', marker=dict(symbol='star', size=30, color='white')))

    # Label each centroid with its 1-based group-number emoji (max K is 8,
    # so dmojis[i+1] stays in range).
    for i, coord in enumerate(kmeans.cluster_centers_):
        fig.add_annotation(
            x=coord[0],
            y=coord[1],
            text=dmojis[i+1],
            showarrow=True,
            font=dict(color='white', size=30)
        )

    fig.update_layout(width=1200, height=500)
    st.plotly_chart(fig)

    # Second PCA toggle below the plot; the explicit key avoids a
    # duplicate-widget-ID clash with the same-label button above.
    if st.button('Toggle PCA for Visualization', key=125):
        st.session_state.use_pca = not st.session_state.use_pca

    if st.session_state.use_pca:
        st.write("""
        ##### 🧠 PCA (Principal Component Analysis) is like looking at a messy room from the best angle to see the most mess. It helps us see our data more clearly!
        """)

    st.write(f"##### Overlapping clusters mean some flowers are very similar and hard to tell apart just by looking at these features.")
    st.write(f"# Based on your flower data (⭐), it likely belongs to **Group {dmojis[predicted_cluster[0]+1]}**")

    # Closing Note
    st.write("""
    ### Wrap Up    
    ##### Just as sorting toys in a room, we group flowers by features; adjust the data to pick a flower and set how many boxes (groups) you want to use.
    """)


with tab2:
    st.write("""
    ## Advanced Overview of Clustering

    Clustering, in the context of machine learning, refers to the task of partitioning the dataset into groups, known as clusters. The aim is to segregate groups with similar traits and assign them into clusters.

    ### K-Means Algorithm

    The K-Means clustering method is an iterative method that tries to partition the dataset into \(K\) pre-defined distinct non-overlapping subgroups (clusters) where each data point belongs to only one group.

    Here's a brief rundown:

    1. **Initialization**: Choose \(K\) initial centroids. (Centroids is a fancy term for 'the center of the cluster'.)
    2. **Assignment**: Assign each data point to the nearest centroid. All the points assigned to a centroid form a cluster.
    3. **Update**: Recompute the centroid of each cluster.
    4. **Repeat**: Keep repeating steps 2 and 3 until the centroids no longer move too much.
    """)

    st.write("The mathematical goal is to minimize the within-cluster sum of squares. The formula is:")
    st.latex(r'''
    \mathrm{WCSS} = \sum_{i=1}^{K} \sum_{x \in C_i} \| x - \mu_i \|^2
    ''')

    st.latex(r'''
    \begin{align*}
    \text{Where:} \\
    & \mathrm{WCSS} \text{ is the within-cluster sum of squares we want to minimize.} \\
    & K \text{ is the number of clusters.} \\
    & C_i \text{ is the i-th cluster.} \\
    & \mu_i \text{ is the centroid of the i-th cluster.} \\
    & x \text{ is a data point in cluster } C_i.
    \end{align*}
    ''')

    st.write("""
    The K-Means algorithm tries to find the best centroids such that the \( \mathrm{WCSS} \) is minimized.
    """)

    # Initialize 'use_pca' BEFORE the toggle button can read or flip it, so
    # the key exists regardless of which tab's widgets execute first.
    if 'use_pca' not in st.session_state:
        st.session_state.use_pca = True

    # PCA toggle (explicit key: a same-label button exists in the basic tab).
    if st.button('Toggle PCA for Visualization', key=12):
        st.session_state.use_pca = not st.session_state.use_pca

    if st.session_state.use_pca:
        st.write("""
        ##### 🧠 PCA (Principal Component Analysis) is a mathematical technique that helps us view our data from the best perspective. It identifies the directions (principal components) that maximize variance, allowing us to see patterns and structures more clearly.
        """)
        # Project the 4-D iris measurements down to 2-D for plotting.
        pca = PCA(n_components=2)
        X_transformed = pca.fit_transform(X)
        user_features_transformed = pca.transform([user_features])[0]
    else:
        X_transformed = X[:, :2]  # Just use the first two features for visualization
        user_features_transformed = user_features[:2]

    # K-Means for the advanced tab.  n_init=10 pins the classic sklearn
    # default explicitly, so releases where the default changed to 'auto'
    # behave identically and emit no FutureWarning.
    kmeans_advanced = KMeans(n_clusters=n_clusters_advanced, n_init=10)
    y_kmeans_advanced = kmeans_advanced.fit_predict(X_transformed)

    # DataFrame view of the 2-D points for convenient per-cluster filtering.
    df_transformed = pd.DataFrame(X_transformed, columns=['Feature1', 'Feature2'])
    df_transformed['cluster'] = y_kmeans_advanced

    fig_advanced = go.Figure()

    # Shade each cluster's footprint with its convex hull.
    for cluster in np.unique(y_kmeans_advanced):
        cluster_data = df_transformed[df_transformed['cluster'] == cluster]
        x_data = cluster_data['Feature1'].values
        y_data = cluster_data['Feature2'].values
        if len(cluster_data) > 2:  # ConvexHull requires at least 3 points
            hull = ConvexHull(cluster_data[['Feature1', 'Feature2']])
            fig_advanced.add_trace(go.Scatter(x=x_data[hull.vertices], y=y_data[hull.vertices], fill='toself', fillcolor=px.colors.qualitative.Set1[cluster], opacity=0.5, line=dict(width=0), showlegend=False))

    # Scatter of all samples, colored by assigned cluster.
    fig_advanced.add_trace(go.Scatter(x=df_transformed['Feature1'], y=df_transformed['Feature2'], mode='markers', marker=dict(color=y_kmeans_advanced, colorscale=px.colors.qualitative.Set1), showlegend=False))

    # Add user input as a star marker.
    fig_advanced.add_trace(go.Scatter(x=[user_features_transformed[0]], y=[user_features_transformed[1]], mode='markers', marker=dict(symbol='star', size=30, color='white')))

    # Label each centroid with its 1-based group-number emoji.
    for i, coord in enumerate(kmeans_advanced.cluster_centers_):
        fig_advanced.add_annotation(
            x=coord[0],
            y=coord[1],
            text=dmojis[i+1],
            showarrow=True,
            font=dict(color='white', size=30)
        )

    fig_advanced.update_layout(width=1200, height=500)
    st.plotly_chart(fig_advanced)

    # NOTE: description corrected — the plot labels centroids with numbered
    # emoji annotations, not gray X marks.
    st.write("""
    ### Interpretation

    The plot displays how data points are grouped into clusters. The numbered markers represent the center of each cluster, known as centroids. The positioning of these centroids is determined by the mean of all data points in the cluster.

    Keep in mind that the positioning of these centroids is crucial, as they determine the grouping of data. The algorithm tries to place them in such a way that the distance between the data points and their respective centroid is minimized.

    **Feel free to adjust the number of clusters to see how data points get re-grouped!**
    """)


with about:
    # Credits tab: author name plus LinkedIn and Streamlit badges.
    st.title("About")
    author_md = """
    ## Created by **Mustafa Alhamad**.
    """
    st.markdown(author_md)
    linkedin_badge = '[<img src="https://www.iconpacks.net/icons/2/free-linkedin-logo-icon-2430-thumb.png" width="128" height="128"/>](https://www.linkedin.com/in/mustafa-al-hamad-975b67213/)'
    st.markdown(linkedin_badge, unsafe_allow_html=True)
    made_with = '### Made with <img src="https://streamlit.io/images/brand/streamlit-logo-secondary-colormark-darktext.svg" width="512" height="512"/>'
    st.markdown(made_with, unsafe_allow_html=True)

# Hide Streamlit's toolbar and footer chrome via CSS injection.
st.markdown(
    """
            <style>
            [data-testid="stToolbar"] {visibility: hidden !important;}
            footer {visibility: hidden !important;}
            </style>
            """,
    unsafe_allow_html=True,
)