kheejay88 commited on
Commit
3ab4fb7
·
1 Parent(s): b204417
Files changed (2) hide show
  1. app.py +197 -0
  2. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from sklearn.datasets import load_iris
3
+ from sklearn.cluster import KMeans
4
+ from sklearn.preprocessing import StandardScaler
5
+ import pandas as pd
6
+ import seaborn as sns
7
+ import matplotlib.pyplot as plt
8
+
9
# Load the Iris measurements once and keep a standardized copy for clustering.
@st.cache_data
def load_data():
    """Return the raw Iris features, their z-scored version, and the feature names.

    Returns:
        tuple: (raw feature matrix, standardized feature matrix, list of feature names).
    """
    iris = load_iris()
    raw_features = iris.data
    names = iris.feature_names

    # Standardize so each feature contributes equally to K-Means' distance metric.
    standardized = StandardScaler().fit_transform(raw_features)

    return raw_features, standardized, names

X, X_scaled, feature_names = load_data()
22
+
23
# Group the standardized samples with K-Means.
@st.cache_data
def perform_clustering(X_scaled, n_clusters=3):
    """Fit K-Means on the scaled features.

    Args:
        X_scaled: standardized feature matrix.
        n_clusters: number of clusters; defaults to 3 (the Iris dataset's
            natural grouping).

    Returns:
        tuple: (fitted KMeans model, per-sample cluster ids).
    """
    # random_state is pinned so cluster ids are reproducible across reruns.
    model = KMeans(n_clusters=n_clusters, random_state=42)
    assignments = model.fit_predict(X_scaled)
    return model, assignments

kmeans, clusters = perform_clustering(X_scaled)
31
+
32
# Assemble a DataFrame that pairs every sample with its cluster assignment.
@st.cache_data
def create_clustered_dataframe(X, clusters, feature_names):
    """Build a per-sample DataFrame with cluster ids and human-readable labels.

    Returns:
        tuple: (DataFrame with feature columns plus 'Cluster'/'Cluster Label',
                dict mapping cluster id -> label string).

    NOTE(review): the id -> species-like mapping below assumes KMeans with
    random_state=42 numbers the clusters in exactly this order — confirm
    against the centroids if the model settings ever change.
    """
    frame = pd.DataFrame(X, columns=feature_names)
    frame['Cluster'] = clusters

    # Human-friendly names for the three clusters.
    labels_by_id = {0: 'Setosa-like', 1: 'Versicolor-like', 2: 'Virginica-like'}
    frame['Cluster Label'] = frame['Cluster'].map(labels_by_id)

    return frame, labels_by_id

df, cluster_labels = create_clustered_dataframe(X, clusters, feature_names)
45
+
46
# --------------------- Streamlit App ---------------------
# Page config must be the first Streamlit UI call.
st.set_page_config(page_title="Unsupervised ML: Iris Clustering", layout="wide")

# App title shown above every tab.
st.title("🌸 Unsupervised Machine Learning: Iris Clustering App")

# Three tabs separate the explanation, the charts, and the interactive prediction.
tab1, tab2, tab3 = st.tabs(["🏠 About", "📊 Data Visualization", "🔎 Model Prediction"])

# ------------- About Tab -------------
# Static explanatory text; hoisted to a constant so the tab body stays short.
ABOUT_TEXT = """
## **Overview**
This application demonstrates **unsupervised machine learning** using the Iris dataset.
The app clusters data points based on the features of iris flowers using the **K-Means clustering algorithm**.
After clustering, meaningful labels are assigned based on the cluster’s statistical properties.

## **How It Works**
1. **Data Preprocessing:**
- The dataset is standardized using `StandardScaler` to ensure uniform feature scaling.

2. **Clustering:**
- K-Means clustering is applied to group the data into **three clusters**.
- The number of clusters is based on the natural grouping of the Iris dataset.

3. **Cluster Labeling:**
- After clustering, each cluster is assigned a meaningful label based on its centroid properties and domain knowledge.

4. **Model Testing:**
- The app allows the user to enter custom feature values.
- The model predicts the cluster and assigns a meaningful label to the input data.

## **Dataset Information**
The Iris dataset contains **150 samples** of iris flowers.
Each sample includes the following features:
- 🌸 Sepal Length (cm)
- 🌸 Sepal Width (cm)
- 🌸 Petal Length (cm)
- 🌸 Petal Width (cm)

The goal of clustering is to find natural patterns among these measurements.
"""

with tab1:
    st.header("About This App")
    st.markdown(ABOUT_TEXT)
89
+
90
# ------------- Data Visualization Tab -------------
with tab2:
    st.header("Data Visualization")

    # Scatter of the two sepal features, colored by cluster label.
    st.subheader("Cluster Distribution")
    scatter_fig, scatter_ax = plt.subplots()
    sns.scatterplot(
        data=df,
        x='sepal length (cm)',
        y='sepal width (cm)',
        hue='Cluster Label',
        palette='viridis',
        s=100,
        alpha=0.7,
        ax=scatter_ax,
    )
    scatter_ax.set_xlabel('Sepal Length (cm)')
    scatter_ax.set_ylabel('Sepal Width (cm)')
    st.pyplot(scatter_fig)

    # Correlation heatmap — non-numeric columns are dropped first so .corr() works.
    st.subheader("Heatmap of Feature Correlation")
    numeric_df = df.drop(columns=["Cluster", "Cluster Label"])
    heat_fig, heat_ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(numeric_df.corr(), annot=True, cmap="coolwarm", fmt=".2f", ax=heat_ax)
    st.pyplot(heat_fig)

    # One box plot per selected feature, split by cluster label.
    st.subheader("Box Plot of Features by Cluster")
    for column, plot_title in [
        ('sepal length (cm)', "Sepal Length Distribution Across Clusters"),
        ('petal length (cm)', "Petal Length Distribution Across Clusters"),
    ]:
        box_fig, box_ax = plt.subplots(figsize=(10, 6))
        sns.boxplot(x='Cluster Label', y=column, data=df, palette='viridis', ax=box_ax)
        box_ax.set_title(plot_title)
        st.pyplot(box_fig)

    # Cluster centers (in standardized units) shown as a table.
    st.subheader("Feature Importance (Based on Cluster Centers)")
    centers_table = pd.DataFrame(
        kmeans.cluster_centers_,
        columns=feature_names,
        index=[f'Cluster {i}' for i in range(len(kmeans.cluster_centers_))],
    )
    st.dataframe(centers_table)

    st.markdown("""
**How to Interpret Positive and Negative Values:**
- **Positive Value:** The cluster center is positioned **above the mean** for that feature.
→ The cluster tends to have **higher values** for that feature.
- **Negative Value:** The cluster center is positioned **below the mean** for that feature.
→ The cluster tends to have **lower values** for that feature.
- **Magnitude:**
- Higher absolute values = Stronger influence of that feature in defining the cluster.
- Lower absolute values = Less influence of that feature in cluster formation.
""")
148
+
149
# ------------- Model Prediction Tab -------------

@st.cache_data
def _fit_input_scaler(features):
    """Fit the StandardScaler used to scale user input.

    Cached so the scaler is fitted once instead of being re-fitted on X
    during every Streamlit rerun (the original did
    ``StandardScaler().fit(X)`` inline, which refits on each interaction).
    """
    return StandardScaler().fit(features)

with tab3:
    st.header("Predict Cluster for Custom Input")

    # Collect one value per Iris feature from the user.
    input_features = []
    for feature in feature_names:
        value = st.number_input(f"Enter {feature}", value=0.0, step=0.1)
        input_features.append(value)

    # Scale the input exactly like the training data was scaled.
    input_scaled = _fit_input_scaler(X).transform([input_features])

    if st.button("Predict Cluster"):
        # Predict a cluster id, then translate it to its human-readable label.
        cluster = kmeans.predict(input_scaled)[0]
        label = cluster_labels[cluster]
        st.success(f"The predicted cluster is: **{label}**")

    # Optional diagnostic: distance from the input to every cluster center.
    if st.checkbox("Show Cluster Distances"):
        st.markdown("""
**What is Cluster Distance?**
- Cluster distance represents how close your custom input is to each cluster center.
- A smaller distance means your input is more similar to that cluster's typical values.
""")

        # kmeans.transform returns the distance to each center; [0] = our single sample.
        distances = kmeans.transform(input_scaled)[0]
        distance_df = pd.DataFrame(
            distances,
            index=[f'Cluster {i}' for i in range(len(distances))],
            columns=["Distance"],
        )
        st.write(distance_df)

        # Bar chart of the same distances for a quick visual comparison.
        dist_fig, dist_ax = plt.subplots()
        sns.barplot(
            x=distance_df.index,
            y=distance_df["Distance"],
            palette="viridis",
            ax=dist_ax,
        )
        dist_ax.set_title("Distance to Cluster Centers")
        dist_ax.set_ylabel("Distance")
        st.pyplot(dist_fig)

# --------------------- Footer ---------------------
st.markdown("---")
st.write("**Awesome 😎**")
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ streamlit
2
+ scikit-learn
3
+ matplotlib
4
+ seaborn
5
+ pandas
6
+ numpy