# NOTE: web file-viewer residue (status banner, "File size: 12,534 Bytes",
# commit-hash and line-number gutters) was stripped from above this import.
import streamlit as st
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import plotly.express as px
import base64
import plotly.figure_factory as ff
import plotly.graph_objects as go
from scipy.spatial import ConvexHull
from scipy.spatial import distance
from sklearn.decomposition import PCA
# Wide layout. This is the first st.* call in the script; keep it first.
st.set_page_config(layout="wide")

# CSS override (not JavaScript): lets the main block container use the full
# page width. The selector targets the `.reportview-container` class names.
_FULL_WIDTH_CSS = """
<style>
.reportview-container .main .block-container {
max-width: 100%;
}
</style>
"""
st.markdown(_FULL_WIDTH_CSS, unsafe_allow_html=True)
# Iris feature matrix; both tabs below cluster these rows.
iris = datasets.load_iris()
X = iris["data"]

# Page heading followed by the three tab containers used by the rest of the script.
st.title('Understanding K-Means Clustering')
_tab_labels = ["Basic ☕", "Advanced 🔬", " ℹ️ About"]
tab1, tab2, about = st.tabs(_tab_labels)
# Sidebar visibility flag lives in session state; it starts out visible.
if "toggle" not in st.session_state:
    st.session_state["toggle"] = True

# Clicking the button inverts the stored flag.
toggle_button = st.button("Toggle Sidebar")
if toggle_button:
    st.session_state["toggle"] = not st.session_state["toggle"]

# Keycap-digit emoji indexed by digit; used as group labels on the plots.
dmojis = ["0️⃣", "1️⃣", "2️⃣", "3️⃣", "4️⃣", "5️⃣", "6️⃣", "7️⃣", "8️⃣", "9️⃣"]

# Fallback inputs: these values are used as-is whenever the sidebar block
# below is skipped, and n_clusters_advanced also seeds the K slider.
user_features = [6.5, 3.5, 4.5, 1.5]
n_clusters_advanced = 2
if st.session_state.toggle:
    # Flower measurements are entered through sidebar sliders.
    st.sidebar.header('Input Your Flower Data')

    def user_input_features():
        """Render the four measurement sliders and return their values.

        Returns:
            list: the selected sepal length, sepal width, petal length and
            petal width, in that order.
        """
        # (label, minimum, maximum, initial value) for each slider, in
        # display order.
        slider_specs = (
            ('Sepal Length (cm)', 4.0, 8.0, 6.5),
            ('Sepal Width (cm)', 2.0, 4.5, 3.5),
            ('Petal Length (cm)', 1.0, 7.0, 4.5),
            ('Petal Width (cm)', 0.1, 2.5, 1.5),
        )
        return [
            st.sidebar.slider(label, lowest, highest, initial)
            for label, lowest, highest, initial in slider_specs
        ]

    # Live slider values replace the hard-coded defaults.
    user_features = user_input_features()

    # Cluster count shared by both tabs; the current value is the slider default.
    st.sidebar.header('K-Means Parameters')
    n_clusters_advanced = st.sidebar.slider(
        'Number of Clusters (K)', 1, 8, n_clusters_advanced
    )
# CSS override: gives the main block container a fixed 2000px height with
# scrolling on overflow.
# NOTE(review): indentation was lost in this copy of the file; this statement
# is assumed to sit at top level rather than inside the sidebar `if` - confirm.
_SCROLL_CSS = """
<style>
.reportview-container .main .block-container {
overflow: auto;
height: 2000px;
}
</style>
"""
st.markdown(_SCROLL_CSS, unsafe_allow_html=True)
# ---------------------------------------------------------------------------
# Basic tab: plain-language walkthrough with a 2-D cluster plot.
# ---------------------------------------------------------------------------
_BASIC_INTRO_MD = """
### What is Clustering?
##### Clustering with K-Means is a machine learning concept like tidying a messy room by grouping similar items, but for data instead of physical objects.
"""
_BASIC_PCA_MD = """
##### 🧠 PCA (Principal Component Analysis) is like looking at a messy room from the best angle to see the most mess. It helps us see our data more clearly!
"""
_BASIC_GROUPS_MD = """
### Visualizing Groups
##### Here are the groups from our tidying method. Each color has a number at its center, representing its group.
"""
_BASIC_WRAP_UP_MD = """
### Wrap Up
##### Just as sorting toys in a room, we group flowers by features; adjust the data to pick a flower and set how many boxes (groups) you want to use.
"""


def _toggle_pca_basic():
    """Invert the shared ``use_pca`` flag (button callback).

    Button callbacks run before the script reruns, so everything drawn in the
    rerun - including content placed above the clicked button - sees the new
    value. A missing flag is treated as its default, ``True``.
    """
    current = st.session_state.use_pca if 'use_pca' in st.session_state else True
    st.session_state.use_pca = not current


with tab1:
    st.write(_BASIC_INTRO_MD)

    # Initialise the flag before any widget reads or flips it.
    if 'use_pca' not in st.session_state:
        st.session_state.use_pca = True

    # Button to toggle PCA
    st.button('Toggle PCA for Visualization', on_click=_toggle_pca_basic)

    if st.session_state.use_pca:
        st.write(_BASIC_PCA_MD)
        # Apply PCA for dimensionality reduction
        pca = PCA(n_components=2)
        X_transformed = pca.fit_transform(X)
        user_features_transformed = pca.transform([user_features])[0]
    else:
        # Just use the first two features for visualization
        X_transformed = X[:, :2]
        user_features_transformed = user_features[:2]

    st.write(_BASIC_GROUPS_MD)

    # Create a DataFrame for easier plotting with plotly
    df_transformed = pd.DataFrame(X_transformed, columns=['Feature1', 'Feature2'])

    # K-Means on the 2-D points. A fixed seed keeps group numbers and colours
    # stable across reruns; n_init is explicit so the result does not depend
    # on the installed scikit-learn's default.
    kmeans = KMeans(n_clusters=n_clusters_advanced, n_init=10, random_state=0)
    y_kmeans = kmeans.fit_predict(X_transformed)
    df_transformed['cluster'] = y_kmeans

    # Predict the cluster for the user input in the transformed space
    predicted_cluster = kmeans.predict([user_features_transformed])

    palette = px.colors.qualitative.Set1
    fig = go.Figure()

    # Shade each cluster with the convex hull of its points.
    for cluster in np.unique(y_kmeans):
        cluster_data = df_transformed[df_transformed['cluster'] == cluster]
        if len(cluster_data) <= 2:  # ConvexHull requires at least 3 points
            continue
        x_data = cluster_data['Feature1'].values
        y_data = cluster_data['Feature2'].values
        hull = ConvexHull(cluster_data[['Feature1', 'Feature2']])
        fig.add_trace(go.Scatter(
            x=x_data[hull.vertices],
            y=y_data[hull.vertices],
            fill='toself',
            fillcolor=palette[cluster],
            opacity=0.5,
            line=dict(width=0),
            showlegend=False,
        ))

    # Colour every point by direct palette lookup so it matches its hull fill.
    # (Passing the labels with a colorscale would interpolate across the whole
    # palette and give different colours from the hulls.)
    marker_style = dict(color=[palette[label] for label in y_kmeans])
    if not st.session_state.use_pca:
        # Raw-feature view is drawn with square markers.
        marker_style['symbol'] = 'square'
    fig.add_trace(go.Scatter(
        x=df_transformed['Feature1'],
        y=df_transformed['Feature2'],
        mode='markers',
        marker=marker_style,
        showlegend=False,
    ))

    # Add user input as a star marker
    fig.add_trace(go.Scatter(
        x=[user_features_transformed[0]],
        y=[user_features_transformed[1]],
        mode='markers',
        marker=dict(symbol='star', size=30, color='white'),
    ))

    # Label each centroid with its 1-based group number.
    for i, coord in enumerate(kmeans.cluster_centers_):
        fig.add_annotation(
            x=coord[0],
            y=coord[1],
            text=dmojis[i + 1],
            showarrow=True,
            font=dict(color='white', size=30),
        )

    # Update layout
    fig.update_layout(width=1200, height=500)
    st.plotly_chart(fig)

    # Second toggle below the chart. It must be a callback: flipping the flag
    # inline here would happen after the chart above was already drawn.
    st.button('Toggle PCA for Visualization', key=125, on_click=_toggle_pca_basic)
    if st.session_state.use_pca:
        st.write(_BASIC_PCA_MD)

    st.write("##### Overlapping clusters mean some flowers are very similar and hard to tell apart just by looking at these features.")
    st.write(f"# Based on your flower data (⭐), it likely belongs to **Group {dmojis[predicted_cluster[0]+1]}**")

    # Closing Note
    st.write(_BASIC_WRAP_UP_MD)
# ---------------------------------------------------------------------------
# Advanced tab: algorithm description, objective function and cluster plot.
# ---------------------------------------------------------------------------
# Raw strings: the text contains LaTeX backslashes. Inline math uses `$...$`,
# the delimiter Streamlit markdown renders as LaTeX.
_ADVANCED_OVERVIEW_MD = r"""
## Advanced Overview of Clustering
Clustering, in the context of machine learning, refers to the task of partitioning the dataset into groups, known as clusters. The aim is to segregate groups with similar traits and assign them into clusters.
### K-Means Algorithm
The K-Means clustering method is an iterative method that tries to partition the dataset into $K$ pre-defined distinct non-overlapping subgroups (clusters) where each data point belongs to only one group.
Here's a brief rundown:
1. **Initialization**: Choose $K$ initial centroids. (Centroids is a fancy term for 'the center of the cluster'.)
2. **Assignment**: Assign each data point to the nearest centroid. All the points assigned to a centroid form a cluster.
3. **Update**: Recompute the centroid of each cluster.
4. **Repeat**: Keep repeating steps 2 and 3 until the centroids no longer move too much.
"""
_ADVANCED_WCSS_TEX = r'''
\mathrm{WCSS} = \sum_{i=1}^{K} \sum_{x \in C_i} \| x - \mu_i \|^2
'''
_ADVANCED_WCSS_LEGEND_TEX = r'''
\begin{align*}
\text{Where:} \\
& \mathrm{WCSS} \text{ is the within-cluster sum of squares we want to minimize.} \\
& K \text{ is the number of clusters.} \\
& C_i \text{ is the i-th cluster.} \\
& \mu_i \text{ is the centroid of the i-th cluster.} \\
& x \text{ is a data point in cluster } C_i.
\end{align*}
'''
_ADVANCED_WCSS_GOAL_MD = r"""
The K-Means algorithm tries to find the best centroids such that the $\mathrm{WCSS}$ is minimized.
"""
_ADVANCED_PCA_MD = """
##### 🧠 PCA (Principal Component Analysis) is a mathematical technique that helps us view our data from the best perspective. It identifies the directions (principal components) that maximize variance, allowing us to see patterns and structures more clearly.
"""
# The plot labels centroids with numbered markers, so the text says so.
_ADVANCED_INTERPRETATION_MD = """
### Interpretation
The plot displays how data points are grouped into clusters. The numbered markers represent the center of each cluster, known as centroids. The positioning of these centroids is determined by the mean of all data points in the cluster.
Keep in mind that the positioning of these centroids is crucial, as they determine the grouping of data. The algorithm tries to place them in such a way that the distance between the data points and their respective centroid is minimized.
**Feel free to adjust the number of clusters to see how data points get re-grouped!**
"""


def _toggle_pca_advanced():
    """Invert the shared ``use_pca`` flag (button callback).

    Button callbacks run before the script reruns, so content rendered
    earlier in the script than this tab also sees the new value. A missing
    flag is treated as its default, ``True``.
    """
    current = st.session_state.use_pca if 'use_pca' in st.session_state else True
    st.session_state.use_pca = not current


with tab2:
    st.write(_ADVANCED_OVERVIEW_MD)
    st.write("The mathematical goal is to minimize the within-cluster sum of squares. The formula is:")
    st.latex(_ADVANCED_WCSS_TEX)
    st.latex(_ADVANCED_WCSS_LEGEND_TEX)
    st.write(_ADVANCED_WCSS_GOAL_MD)

    # Initialise the flag before any widget reads or flips it.
    if 'use_pca' not in st.session_state:
        st.session_state.use_pca = True

    # Button to toggle PCA
    st.button('Toggle PCA for Visualization', key=12, on_click=_toggle_pca_advanced)

    if st.session_state.use_pca:
        st.write(_ADVANCED_PCA_MD)
        # Apply PCA for dimensionality reduction
        pca = PCA(n_components=2)
        X_transformed = pca.fit_transform(X)
        user_features_transformed = pca.transform([user_features])[0]
    else:
        # Just use the first two features for visualization
        X_transformed = X[:, :2]
        user_features_transformed = user_features[:2]

    # K-Means on the 2-D points. A fixed seed keeps group numbers and colours
    # stable across reruns; n_init is explicit so the result does not depend
    # on the installed scikit-learn's default.
    kmeans_advanced = KMeans(n_clusters=n_clusters_advanced, n_init=10, random_state=0)
    y_kmeans_advanced = kmeans_advanced.fit_predict(X_transformed)

    # Create a DataFrame for easier plotting with plotly
    df_transformed = pd.DataFrame(X_transformed, columns=['Feature1', 'Feature2'])
    df_transformed['cluster'] = y_kmeans_advanced

    palette = px.colors.qualitative.Set1
    fig_advanced = go.Figure()

    # Shade each cluster with the convex hull of its points.
    for cluster in np.unique(y_kmeans_advanced):
        cluster_data = df_transformed[df_transformed['cluster'] == cluster]
        if len(cluster_data) <= 2:  # ConvexHull requires at least 3 points
            continue
        x_data = cluster_data['Feature1'].values
        y_data = cluster_data['Feature2'].values
        hull = ConvexHull(cluster_data[['Feature1', 'Feature2']])
        fig_advanced.add_trace(go.Scatter(
            x=x_data[hull.vertices],
            y=y_data[hull.vertices],
            fill='toself',
            fillcolor=palette[cluster],
            opacity=0.5,
            line=dict(width=0),
            showlegend=False,
        ))

    # Colour every point by direct palette lookup so it matches its hull fill.
    # (Passing the labels with a colorscale would interpolate across the whole
    # palette and give different colours from the hulls.)
    fig_advanced.add_trace(go.Scatter(
        x=df_transformed['Feature1'],
        y=df_transformed['Feature2'],
        mode='markers',
        marker=dict(color=[palette[label] for label in y_kmeans_advanced]),
        showlegend=False,
    ))

    # Add user input as a star marker
    fig_advanced.add_trace(go.Scatter(
        x=[user_features_transformed[0]],
        y=[user_features_transformed[1]],
        mode='markers',
        marker=dict(symbol='star', size=30, color='white'),
    ))

    # Label each centroid with its 1-based group number.
    for i, coord in enumerate(kmeans_advanced.cluster_centers_):
        fig_advanced.add_annotation(
            x=coord[0],
            y=coord[1],
            text=dmojis[i + 1],
            showarrow=True,
            font=dict(color='white', size=30),
        )

    # Update layout
    fig_advanced.update_layout(width=1200, height=500)
    st.plotly_chart(fig_advanced)

    st.write(_ADVANCED_INTERPRETATION_MD)
# About tab: author credit, a LinkedIn badge link and a "Made with" logo.
_LINKEDIN_BADGE_MD = '[<img src="https://www.iconpacks.net/icons/2/free-linkedin-logo-icon-2430-thumb.png" width="128" height="128"/>](https://www.linkedin.com/in/mustafa-al-hamad-975b67213/)'
_MADE_WITH_MD = '### Made with <img src="https://streamlit.io/images/brand/streamlit-logo-secondary-colormark-darktext.svg" width="512" height="512"/>'

with about:
    st.title("About")
    st.markdown("\n## Created by **Mustafa Alhamad**.\n")
    # Both snippets embed raw <img> tags, hence unsafe_allow_html.
    for _html_snippet in (_LINKEDIN_BADGE_MD, _MADE_WITH_MD):
        st.markdown(_html_snippet, unsafe_allow_html=True)
# CSS that hides the element tagged data-testid="stToolbar" and the page
# footer. (A stray trailing "|" left over from the file viewer was removed
# from the final call; it made the statement a syntax error.)
hide_streamlit_style = """
<style>
[data-testid="stToolbar"] {visibility: hidden !important;}
footer {visibility: hidden !important;}
</style>
"""
st.markdown(hide_streamlit_style, unsafe_allow_html=True)