kheejay88 committed on
Commit
1b013af
·
verified ·
1 Parent(s): 3ab4fb7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +198 -197
app.py CHANGED
@@ -1,197 +1,198 @@
1
- import streamlit as st
2
- from sklearn.datasets import load_iris
3
- from sklearn.cluster import KMeans
4
- from sklearn.preprocessing import StandardScaler
5
- import pandas as pd
6
- import seaborn as sns
7
- import matplotlib.pyplot as plt
8
-
9
- # Load and preprocess the Iris dataset
10
- @st.cache_data
11
- def load_data():
12
- iris = load_iris()
13
- X = iris.data
14
- feature_names = iris.feature_names
15
-
16
- scaler = StandardScaler()
17
- X_scaled = scaler.fit_transform(X)
18
-
19
- return X, X_scaled, feature_names
20
-
21
- X, X_scaled, feature_names = load_data()
22
-
23
- # Perform K-Means clustering
24
- @st.cache_data
25
- def perform_clustering(X_scaled, n_clusters=3):
26
- kmeans = KMeans(n_clusters=n_clusters, random_state=42)
27
- clusters = kmeans.fit_predict(X_scaled)
28
- return kmeans, clusters
29
-
30
- kmeans, clusters = perform_clustering(X_scaled)
31
-
32
- # Create a DataFrame with the clustering results
33
- @st.cache_data
34
- def create_clustered_dataframe(X, clusters, feature_names):
35
- df = pd.DataFrame(X, columns=feature_names)
36
- df['Cluster'] = clusters
37
-
38
- # Assign meaningful labels to clusters based on analysis
39
- cluster_labels = {0: 'Setosa-like', 1: 'Versicolor-like', 2: 'Virginica-like'}
40
- df['Cluster Label'] = df['Cluster'].map(cluster_labels)
41
-
42
- return df, cluster_labels
43
-
44
- df, cluster_labels = create_clustered_dataframe(X, clusters, feature_names)
45
-
46
- # --------------------- Streamlit App ---------------------
47
- st.set_page_config(page_title="Unsupervised ML: Iris Clustering", layout="wide")
48
-
49
- # βœ… App Title
50
- st.title("🌸 Unsupervised Machine Learning: Iris Clustering App")
51
-
52
- # Tabs for organization
53
- tab1, tab2, tab3 = st.tabs(["🏠 About", "πŸ“Š Data Visualization", "πŸ”Ž Model Prediction"])
54
-
55
- # ------------- About Tab -------------
56
- with tab1:
57
- st.header("About This App")
58
- st.markdown("""
59
- ## **Overview**
60
- This application demonstrates **unsupervised machine learning** using the Iris dataset.
61
- The app clusters data points based on the features of iris flowers using the **K-Means clustering algorithm**.
62
- After clustering, meaningful labels are assigned based on the cluster’s statistical properties.
63
-
64
- ## **How It Works**
65
- 1. **Data Preprocessing:**
66
- - The dataset is standardized using `StandardScaler` to ensure uniform feature scaling.
67
-
68
- 2. **Clustering:**
69
- - K-Means clustering is applied to group the data into **three clusters**.
70
- - The number of clusters is based on the natural grouping of the Iris dataset.
71
-
72
- 3. **Cluster Labeling:**
73
- - After clustering, each cluster is assigned a meaningful label based on its centroid properties and domain knowledge.
74
-
75
- 4. **Model Testing:**
76
- - The app allows the user to enter custom feature values.
77
- - The model predicts the cluster and assigns a meaningful label to the input data.
78
-
79
- ## **Dataset Information**
80
- The Iris dataset contains **150 samples** of iris flowers.
81
- Each sample includes the following features:
82
- - 🌸 Sepal Length (cm)
83
- - 🌸 Sepal Width (cm)
84
- - 🌸 Petal Length (cm)
85
- - 🌸 Petal Width (cm)
86
-
87
- The goal of clustering is to find natural patterns among these measurements.
88
- """)
89
-
90
- # ------------- Data Visualization Tab -------------
91
- with tab2:
92
- st.header("Data Visualization")
93
-
94
- # βœ… Cluster distribution plot
95
- st.subheader("Cluster Distribution")
96
- fig, ax = plt.subplots()
97
- sns.scatterplot(
98
- x=df['sepal length (cm)'],
99
- y=df['sepal width (cm)'],
100
- hue=df['Cluster Label'],
101
- palette='viridis',
102
- s=100,
103
- alpha=0.7,
104
- ax=ax
105
- )
106
- plt.xlabel('Sepal Length (cm)')
107
- plt.ylabel('Sepal Width (cm)')
108
- st.pyplot(fig)
109
-
110
- # βœ… Heatmap (Fixed by dropping non-numeric columns)
111
- st.subheader("Heatmap of Feature Correlation")
112
- numeric_df = df.drop(columns=["Cluster", "Cluster Label"]) # Drop non-numeric columns
113
- fig, ax = plt.subplots(figsize=(6, 4))
114
- sns.heatmap(numeric_df.corr(), annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
115
- st.pyplot(fig)
116
-
117
- # βœ… Box plots (Replaced pair plot for better clarity)
118
- st.subheader("Box Plot of Features by Cluster")
119
- fig, ax = plt.subplots(figsize=(10, 6))
120
- sns.boxplot(x='Cluster Label', y='sepal length (cm)', data=df, palette='viridis', ax=ax)
121
- plt.title("Sepal Length Distribution Across Clusters")
122
- st.pyplot(fig)
123
-
124
- fig, ax = plt.subplots(figsize=(10, 6))
125
- sns.boxplot(x='Cluster Label', y='petal length (cm)', data=df, palette='viridis', ax=ax)
126
- plt.title("Petal Length Distribution Across Clusters")
127
- st.pyplot(fig)
128
-
129
- # βœ… Feature importance (Tabular format with explanation)
130
- st.subheader("Feature Importance (Based on Cluster Centers)")
131
- feature_importance = pd.DataFrame(
132
- kmeans.cluster_centers_,
133
- columns=feature_names,
134
- index=[f'Cluster {i}' for i in range(len(kmeans.cluster_centers_))]
135
- )
136
- st.dataframe(feature_importance)
137
-
138
- st.markdown("""
139
- **How to Interpret Positive and Negative Values:**
140
- - **Positive Value:** The cluster center is positioned **above the mean** for that feature.
141
- β†’ The cluster tends to have **higher values** for that feature.
142
- - **Negative Value:** The cluster center is positioned **below the mean** for that feature.
143
- β†’ The cluster tends to have **lower values** for that feature.
144
- - **Magnitude:**
145
- - Higher absolute values = Stronger influence of that feature in defining the cluster.
146
- - Lower absolute values = Less influence of that feature in cluster formation.
147
- """)
148
-
149
- # ------------- Model Prediction Tab -------------
150
- with tab3:
151
- st.header("Predict Cluster for Custom Input")
152
-
153
- # βœ… Collect user input for prediction
154
- input_features = []
155
- for feature in feature_names:
156
- value = st.number_input(f"Enter {feature}", value=0.0, step=0.1)
157
- input_features.append(value)
158
-
159
- # βœ… Scale input data
160
- input_scaled = StandardScaler().fit(X).transform([input_features])
161
-
162
- if st.button("Predict Cluster"):
163
- cluster = kmeans.predict(input_scaled)[0]
164
- label = cluster_labels[cluster]
165
- st.success(f"The predicted cluster is: **{label}**")
166
-
167
- # βœ… Show cluster center distances with explanation
168
- if st.checkbox("Show Cluster Distances"):
169
- st.markdown("""
170
- **What is Cluster Distance?**
171
- - Cluster distance represents how close your custom input is to each cluster center.
172
- - A smaller distance means your input is more similar to that cluster's typical values.
173
- """)
174
-
175
- distances = kmeans.transform(input_scaled)[0]
176
- distance_df = pd.DataFrame(
177
- distances,
178
- index=[f'Cluster {i}' for i in range(len(distances))],
179
- columns=["Distance"]
180
- )
181
- st.write(distance_df)
182
-
183
- # βœ… Plot distances
184
- fig, ax = plt.subplots()
185
- sns.barplot(
186
- x=distance_df.index,
187
- y=distance_df["Distance"],
188
- palette="viridis",
189
- ax=ax
190
- )
191
- ax.set_title("Distance to Cluster Centers")
192
- ax.set_ylabel("Distance")
193
- st.pyplot(fig)
194
-
195
- # --------------------- Footer ---------------------
196
- st.markdown("---")
197
- st.write("**Awesome 😎**")
 
 
1
+ import streamlit as st
2
+ from sklearn.datasets import load_iris
3
+ from sklearn.cluster import KMeans
4
+ from sklearn.preprocessing import StandardScaler
5
+ import pandas as pd
6
+ import seaborn as sns
7
+ import matplotlib.pyplot as plt
8
+
9
+ # --------------------- Streamlit App ---------------------
10
+ st.set_page_config(page_title="Unsupervised ML: Iris Clustering", layout="wide")
11
+
12
+ # Load and preprocess the Iris dataset
13
+ @st.cache_data
14
+ def load_data():
15
+ iris = load_iris()
16
+ X = iris.data
17
+ feature_names = iris.feature_names
18
+
19
+ scaler = StandardScaler()
20
+ X_scaled = scaler.fit_transform(X)
21
+
22
+ return X, X_scaled, feature_names
23
+
24
+ X, X_scaled, feature_names = load_data()
25
+
26
# Perform K-Means clustering
@st.cache_data
def perform_clustering(X_scaled, n_clusters=3):
    """Fit K-Means on the standardized features.

    Args:
        X_scaled: Standardized feature matrix of shape (n_samples, n_features).
        n_clusters: Number of clusters to form (default 3, matching the three
            natural Iris groupings).

    Returns:
        tuple: (fitted KMeans model, cluster id assigned to each sample).
    """
    # n_init is pinned explicitly: scikit-learn changed its default from 10 to
    # 'auto' in 1.4 (FutureWarning in 1.2/1.3), so pinning keeps the clustering
    # reproducible across library versions. random_state=42 fixes the seed.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(X_scaled)
    return kmeans, clusters

kmeans, clusters = perform_clustering(X_scaled)
34
+
35
# Create a DataFrame with the clustering results
@st.cache_data
def create_clustered_dataframe(X, clusters, feature_names):
    """Build a DataFrame of features plus cluster id and human-readable label.

    Args:
        X: Raw feature matrix of shape (n_samples, n_features).
        clusters: Cluster id per sample, as produced by KMeans.
        feature_names: Column names for the feature matrix.

    Returns:
        tuple: (DataFrame with 'Cluster' and 'Cluster Label' columns appended,
            dict mapping each cluster id present to its label).
    """
    df = pd.DataFrame(X, columns=feature_names)
    df['Cluster'] = clusters

    # Meaningful names for the three expected clusters.
    # NOTE(review): this mapping assumes KMeans with random_state=42 assigns
    # these ids to these species-like groups — re-verify if the seed or
    # n_clusters changes.
    base_labels = {0: 'Setosa-like', 1: 'Versicolor-like', 2: 'Virginica-like'}
    # Cover every cluster id actually present so .map never yields NaN
    # (e.g. if n_clusters is ever raised above 3). Identical to the base
    # mapping for the default three clusters.
    cluster_labels = {
        i: base_labels.get(i, f'Cluster {i}')
        for i in sorted({int(c) for c in clusters})
    }
    df['Cluster Label'] = df['Cluster'].map(cluster_labels)

    return df, cluster_labels

df, cluster_labels = create_clustered_dataframe(X, clusters, feature_names)
48
+
49
+
50
# App title shown at the top of every tab
st.title("🌸 Unsupervised Machine Learning: Iris Clustering App")

# Three tabs: static overview, charts of the clustering, interactive prediction
tab1, tab2, tab3 = st.tabs(["🏠 About", "πŸ“Š Data Visualization", "πŸ”Ž Model Prediction"])
55
+
56
# ------------- About Tab -------------
# Static documentation tab: explains the workflow (preprocess -> cluster ->
# label -> predict) and the dataset. Display-only; no computation here.
with tab1:
    st.header("About This App")
    st.markdown("""
    ## **Overview**
    This application demonstrates **unsupervised machine learning** using the Iris dataset.
    The app clusters data points based on the features of iris flowers using the **K-Means clustering algorithm**.
    After clustering, meaningful labels are assigned based on the cluster’s statistical properties.

    ## **How It Works**
    1. **Data Preprocessing:**
       - The dataset is standardized using `StandardScaler` to ensure uniform feature scaling.

    2. **Clustering:**
       - K-Means clustering is applied to group the data into **three clusters**.
       - The number of clusters is based on the natural grouping of the Iris dataset.

    3. **Cluster Labeling:**
       - After clustering, each cluster is assigned a meaningful label based on its centroid properties and domain knowledge.

    4. **Model Testing:**
       - The app allows the user to enter custom feature values.
       - The model predicts the cluster and assigns a meaningful label to the input data.

    ## **Dataset Information**
    The Iris dataset contains **150 samples** of iris flowers.
    Each sample includes the following features:
    - 🌸 Sepal Length (cm)
    - 🌸 Sepal Width (cm)
    - 🌸 Petal Length (cm)
    - 🌸 Petal Width (cm)

    The goal of clustering is to find natural patterns among these measurements.
    """)
90
+
91
# ------------- Data Visualization Tab -------------
# Charts of the clustering result: scatter, correlation heatmap, box plots,
# and the cluster-center table.
with tab2:
    st.header("Data Visualization")

    # Scatter of sepal length vs. width, colored by cluster label
    st.subheader("Cluster Distribution")
    fig, ax = plt.subplots()
    sns.scatterplot(
        x=df['sepal length (cm)'],
        y=df['sepal width (cm)'],
        hue=df['Cluster Label'],
        palette='viridis',
        s=100,
        alpha=0.7,
        ax=ax
    )
    ax.set_xlabel('Sepal Length (cm)')
    ax.set_ylabel('Sepal Width (cm)')
    st.pyplot(fig)

    # Correlation heatmap over the numeric feature columns only
    st.subheader("Heatmap of Feature Correlation")
    numeric_df = df.drop(columns=["Cluster", "Cluster Label"])  # drop non-numeric / id columns
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(numeric_df.corr(), annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
    st.pyplot(fig)

    # Box plots grouped by cluster label.
    # FIX: seaborn 0.13+ deprecates passing `palette` without `hue`; assigning
    # the x-variable to `hue` with legend=False keeps identical per-box colors.
    st.subheader("Box Plot of Features by Cluster")
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(x='Cluster Label', y='sepal length (cm)', data=df,
                hue='Cluster Label', palette='viridis', legend=False, ax=ax)
    ax.set_title("Sepal Length Distribution Across Clusters")
    st.pyplot(fig)

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(x='Cluster Label', y='petal length (cm)', data=df,
                hue='Cluster Label', palette='viridis', legend=False, ax=ax)
    ax.set_title("Petal Length Distribution Across Clusters")
    st.pyplot(fig)

    # Cluster centers (standardized units) shown as a feature-importance table
    st.subheader("Feature Importance (Based on Cluster Centers)")
    feature_importance = pd.DataFrame(
        kmeans.cluster_centers_,
        columns=feature_names,
        index=[f'Cluster {i}' for i in range(len(kmeans.cluster_centers_))]
    )
    st.dataframe(feature_importance)

    st.markdown("""
    **How to Interpret Positive and Negative Values:**
    - **Positive Value:** The cluster center is positioned **above the mean** for that feature.
      β†’ The cluster tends to have **higher values** for that feature.
    - **Negative Value:** The cluster center is positioned **below the mean** for that feature.
      β†’ The cluster tends to have **lower values** for that feature.
    - **Magnitude:**
      - Higher absolute values = Stronger influence of that feature in defining the cluster.
      - Lower absolute values = Less influence of that feature in cluster formation.
    """)
149
+
150
# ------------- Model Prediction Tab -------------
# Interactive tab: collect feature values, standardize them into the model's
# input space, and report the predicted cluster (plus optional distances).
with tab3:
    st.header("Predict Cluster for Custom Input")

    # One number input per dataset feature
    input_features = []
    for feature in feature_names:
        value = st.number_input(f"Enter {feature}", value=0.0, step=0.1)
        input_features.append(value)

    # Standardize the input with a scaler fit on the full training data, so the
    # input lives in the same standardized space the model was trained in.
    input_scaled = StandardScaler().fit(X).transform([input_features])

    if st.button("Predict Cluster"):
        cluster = kmeans.predict(input_scaled)[0]
        label = cluster_labels[cluster]
        st.success(f"The predicted cluster is: **{label}**")

    # Optional: Euclidean distance from the input to every cluster center
    if st.checkbox("Show Cluster Distances"):
        st.markdown("""
        **What is Cluster Distance?**
        - Cluster distance represents how close your custom input is to each cluster center.
        - A smaller distance means your input is more similar to that cluster's typical values.
        """)

        # kmeans.transform returns the distance to each center for the sample
        distances = kmeans.transform(input_scaled)[0]
        distance_df = pd.DataFrame(
            distances,
            index=[f'Cluster {i}' for i in range(len(distances))],
            columns=["Distance"]
        )
        st.write(distance_df)

        # Bar chart of the distances.
        # FIX: seaborn 0.13+ deprecates passing `palette` without `hue`;
        # mapping the x-variable to `hue` with legend=False keeps the colors.
        fig, ax = plt.subplots()
        sns.barplot(
            x=distance_df.index,
            y=distance_df["Distance"],
            hue=distance_df.index,
            palette="viridis",
            legend=False,
            ax=ax
        )
        ax.set_title("Distance to Cluster Centers")
        ax.set_ylabel("Distance")
        st.pyplot(fig)
195
+
196
# --------------------- Footer ---------------------
st.markdown("---")  # horizontal rule separating the footer from tab content
st.write("**Awesome 😎**")