Spaces:
Sleeping
Sleeping
wow
Browse files- app.py +197 -0
- requirements.txt +6 -0
app.py
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
from sklearn.datasets import load_iris
|
| 3 |
+
from sklearn.cluster import KMeans
|
| 4 |
+
from sklearn.preprocessing import StandardScaler
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import seaborn as sns
|
| 7 |
+
import matplotlib.pyplot as plt
|
| 8 |
+
|
# Load and preprocess the Iris dataset
@st.cache_data
def load_data():
    """Load the Iris dataset.

    Returns:
        (raw, scaled, names): the raw feature matrix, the standardized
        feature matrix, and the list of feature-name strings.
    """
    iris = load_iris()
    raw = iris.data
    names = iris.feature_names

    # Standardize so every feature contributes equally to K-Means distances.
    scaled = StandardScaler().fit_transform(raw)

    return raw, scaled, names

X, X_scaled, feature_names = load_data()
# Perform K-Means clustering
@st.cache_data
def perform_clustering(X_scaled, n_clusters=3):
    """Fit K-Means on the standardized features.

    Args:
        X_scaled: standardized feature matrix, shape (n_samples, n_features).
        n_clusters: number of clusters to form (default 3, matching the
            three iris species).

    Returns:
        (kmeans, clusters): the fitted estimator and the per-sample
        integer cluster labels.
    """
    # n_init is pinned explicitly: its default changed between scikit-learn
    # releases (10 -> 'auto'), so pinning keeps cluster assignments
    # reproducible across environments alongside random_state.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    clusters = kmeans.fit_predict(X_scaled)
    return kmeans, clusters

kmeans, clusters = perform_clustering(X_scaled)
# Create a DataFrame with the clustering results
@st.cache_data
def create_clustered_dataframe(X, clusters, feature_names):
    """Build a DataFrame of features plus cluster ids and readable labels.

    Args:
        X: raw (unscaled) feature matrix.
        clusters: per-sample cluster ids from K-Means.
        feature_names: column names for X.

    Returns:
        (df, cluster_labels): the annotated DataFrame and the
        cluster-id -> label mapping used for the 'Cluster Label' column.
    """
    df = pd.DataFrame(X, columns=feature_names)
    df['Cluster'] = clusters

    # K-Means cluster numbering is arbitrary, so a hard-coded
    # {0: 'Setosa-like', ...} mapping can silently mislabel clusters.
    # Instead, rank clusters by mean petal length: setosa has the shortest
    # petals and virginica the longest, which orders the three groups.
    species_names = ['Setosa-like', 'Versicolor-like', 'Virginica-like']
    by_petal = df.groupby('Cluster')['petal length (cm)'].mean().sort_values()
    if len(by_petal) == len(species_names):
        cluster_labels = {
            cid: species_names[rank] for rank, cid in enumerate(by_petal.index)
        }
    else:
        # Fallback for a non-3 cluster count: generic labels.
        cluster_labels = {cid: f'Cluster {cid}' for cid in by_petal.index}
    df['Cluster Label'] = df['Cluster'].map(cluster_labels)

    return df, cluster_labels

df, cluster_labels = create_clustered_dataframe(X, clusters, feature_names)
# --------------------- Streamlit App ---------------------
# Page-level configuration must be the first Streamlit call on a page.
st.set_page_config(page_title="Unsupervised ML: Iris Clustering", layout="wide")

# App title (emoji restored — the scraped source had mojibake here).
st.title("🌸 Unsupervised Machine Learning: Iris Clustering App")

# Tabs for organization
tab1, tab2, tab3 = st.tabs(["📖 About", "📊 Data Visualization", "🔍 Model Prediction"])
# ------------- About Tab -------------
# Static documentation page; mis-encoded characters in the markdown
# (cluster's, flower emoji) have been repaired.
with tab1:
    st.header("About This App")
    st.markdown("""
## **Overview**
This application demonstrates **unsupervised machine learning** using the Iris dataset.
The app clusters data points based on the features of iris flowers using the **K-Means clustering algorithm**.
After clustering, meaningful labels are assigned based on the cluster's statistical properties.

## **How It Works**
1. **Data Preprocessing:**
   - The dataset is standardized using `StandardScaler` to ensure uniform feature scaling.

2. **Clustering:**
   - K-Means clustering is applied to group the data into **three clusters**.
   - The number of clusters is based on the natural grouping of the Iris dataset.

3. **Cluster Labeling:**
   - After clustering, each cluster is assigned a meaningful label based on its centroid properties and domain knowledge.

4. **Model Testing:**
   - The app allows the user to enter custom feature values.
   - The model predicts the cluster and assigns a meaningful label to the input data.

## **Dataset Information**
The Iris dataset contains **150 samples** of iris flowers.
Each sample includes the following features:
- 🌸 Sepal Length (cm)
- 🌸 Sepal Width (cm)
- 🌸 Petal Length (cm)
- 🌸 Petal Width (cm)

The goal of clustering is to find natural patterns among these measurements.
""")
# ------------- Data Visualization Tab -------------
with tab2:
    st.header("Data Visualization")

    # Scatter plot of clusters in sepal space
    st.subheader("Cluster Distribution")
    fig, ax = plt.subplots()
    sns.scatterplot(
        x=df['sepal length (cm)'],
        y=df['sepal width (cm)'],
        hue=df['Cluster Label'],
        palette='viridis',
        s=100,
        alpha=0.7,
        ax=ax,
    )
    # Label the axes on the figure we actually render (not the implicit
    # "current" pyplot figure, which is fragile with multiple figures).
    ax.set_xlabel('Sepal Length (cm)')
    ax.set_ylabel('Sepal Width (cm)')
    st.pyplot(fig)

    # Heatmap of feature correlations (non-numeric columns dropped first)
    st.subheader("Heatmap of Feature Correlation")
    numeric_df = df.drop(columns=["Cluster", "Cluster Label"])
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(numeric_df.corr(), annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
    st.pyplot(fig)

    # Box plots of key features per cluster.
    # NOTE: seaborn >= 0.13 deprecates palette= without hue=, so the
    # categorical variable is passed as hue with the legend suppressed.
    st.subheader("Box Plot of Features by Cluster")
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(x='Cluster Label', y='sepal length (cm)', data=df,
                hue='Cluster Label', palette='viridis', legend=False, ax=ax)
    ax.set_title("Sepal Length Distribution Across Clusters")
    st.pyplot(fig)

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(x='Cluster Label', y='petal length (cm)', data=df,
                hue='Cluster Label', palette='viridis', legend=False, ax=ax)
    ax.set_title("Petal Length Distribution Across Clusters")
    st.pyplot(fig)

    # Cluster centers as a table: in standardized space each value is the
    # centroid's offset from the feature mean.
    st.subheader("Feature Importance (Based on Cluster Centers)")
    feature_importance = pd.DataFrame(
        kmeans.cluster_centers_,
        columns=feature_names,
        index=[f'Cluster {i}' for i in range(len(kmeans.cluster_centers_))]
    )
    st.dataframe(feature_importance)

    st.markdown("""
**How to Interpret Positive and Negative Values:**
- **Positive Value:** The cluster center is positioned **above the mean** for that feature.
  - The cluster tends to have **higher values** for that feature.
- **Negative Value:** The cluster center is positioned **below the mean** for that feature.
  - The cluster tends to have **lower values** for that feature.
- **Magnitude:**
  - Higher absolute values = Stronger influence of that feature in defining the cluster.
  - Lower absolute values = Less influence of that feature in cluster formation.
""")
# ------------- Model Prediction Tab -------------
with tab3:
    st.header("Predict Cluster for Custom Input")

    # Collect one value per feature for prediction
    input_features = []
    for feature in feature_names:
        value = st.number_input(f"Enter {feature}", value=0.0, step=0.1)
        input_features.append(value)

    # Scale the input with statistics fitted on the training data so it
    # lives in the same standardized space the model was trained in.
    input_scaled = StandardScaler().fit(X).transform([input_features])

    if st.button("Predict Cluster"):
        cluster = kmeans.predict(input_scaled)[0]
        label = cluster_labels[cluster]
        st.success(f"The predicted cluster is: **{label}**")

    # Optional: distances from the input to each cluster center
    if st.checkbox("Show Cluster Distances"):
        st.markdown("""
**What is Cluster Distance?**
- Cluster distance represents how close your custom input is to each cluster center.
- A smaller distance means your input is more similar to that cluster's typical values.
""")

        # kmeans.transform returns the distance to every centroid.
        distances = kmeans.transform(input_scaled)[0]
        distance_df = pd.DataFrame(
            distances,
            index=[f'Cluster {i}' for i in range(len(distances))],
            columns=["Distance"]
        )
        st.write(distance_df)

        # Bar chart of the distances.
        # NOTE: seaborn >= 0.13 deprecates palette= without hue=, so the
        # category is passed as hue with the legend suppressed.
        fig, ax = plt.subplots()
        sns.barplot(
            x=distance_df.index,
            y=distance_df["Distance"],
            hue=distance_df.index,
            palette="viridis",
            legend=False,
            ax=ax,
        )
        ax.set_title("Distance to Cluster Centers")
        ax.set_ylabel("Distance")
        st.pyplot(fig)

# --------------------- Footer ---------------------
st.markdown("---")
st.write("**Awesome 🎉**")
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit
|
| 2 |
+
scikit-learn
|
| 3 |
+
matplotlib
|
| 4 |
+
seaborn
|
| 5 |
+
pandas
|
| 6 |
+
numpy
|