kheejay88 committed on
Commit
1b013af
·
verified ·
1 Parent(s): 3ab4fb7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +198 -197
app.py CHANGED
@@ -1,197 +1,198 @@
1
- import streamlit as st
2
- from sklearn.datasets import load_iris
3
- from sklearn.cluster import KMeans
4
- from sklearn.preprocessing import StandardScaler
5
- import pandas as pd
6
- import seaborn as sns
7
- import matplotlib.pyplot as plt
8
-
9
- # Load and preprocess the Iris dataset
10
- @st.cache_data
11
- def load_data():
12
- iris = load_iris()
13
- X = iris.data
14
- feature_names = iris.feature_names
15
-
16
- scaler = StandardScaler()
17
- X_scaled = scaler.fit_transform(X)
18
-
19
- return X, X_scaled, feature_names
20
-
21
- X, X_scaled, feature_names = load_data()
22
-
23
- # Perform K-Means clustering
24
- @st.cache_data
25
- def perform_clustering(X_scaled, n_clusters=3):
26
- kmeans = KMeans(n_clusters=n_clusters, random_state=42)
27
- clusters = kmeans.fit_predict(X_scaled)
28
- return kmeans, clusters
29
-
30
- kmeans, clusters = perform_clustering(X_scaled)
31
-
32
- # Create a DataFrame with the clustering results
33
- @st.cache_data
34
- def create_clustered_dataframe(X, clusters, feature_names):
35
- df = pd.DataFrame(X, columns=feature_names)
36
- df['Cluster'] = clusters
37
-
38
- # Assign meaningful labels to clusters based on analysis
39
- cluster_labels = {0: 'Setosa-like', 1: 'Versicolor-like', 2: 'Virginica-like'}
40
- df['Cluster Label'] = df['Cluster'].map(cluster_labels)
41
-
42
- return df, cluster_labels
43
-
44
- df, cluster_labels = create_clustered_dataframe(X, clusters, feature_names)
45
-
46
- # --------------------- Streamlit App ---------------------
47
- st.set_page_config(page_title="Unsupervised ML: Iris Clustering", layout="wide")
48
-
49
- # βœ… App Title
50
- st.title("🌸 Unsupervised Machine Learning: Iris Clustering App")
51
-
52
- # Tabs for organization
53
- tab1, tab2, tab3 = st.tabs(["🏠 About", "πŸ“Š Data Visualization", "πŸ”Ž Model Prediction"])
54
-
55
- # ------------- About Tab -------------
56
- with tab1:
57
- st.header("About This App")
58
- st.markdown("""
59
- ## **Overview**
60
- This application demonstrates **unsupervised machine learning** using the Iris dataset.
61
- The app clusters data points based on the features of iris flowers using the **K-Means clustering algorithm**.
62
- After clustering, meaningful labels are assigned based on the cluster’s statistical properties.
63
-
64
- ## **How It Works**
65
- 1. **Data Preprocessing:**
66
- - The dataset is standardized using `StandardScaler` to ensure uniform feature scaling.
67
-
68
- 2. **Clustering:**
69
- - K-Means clustering is applied to group the data into **three clusters**.
70
- - The number of clusters is based on the natural grouping of the Iris dataset.
71
-
72
- 3. **Cluster Labeling:**
73
- - After clustering, each cluster is assigned a meaningful label based on its centroid properties and domain knowledge.
74
-
75
- 4. **Model Testing:**
76
- - The app allows the user to enter custom feature values.
77
- - The model predicts the cluster and assigns a meaningful label to the input data.
78
-
79
- ## **Dataset Information**
80
- The Iris dataset contains **150 samples** of iris flowers.
81
- Each sample includes the following features:
82
- - 🌸 Sepal Length (cm)
83
- - 🌸 Sepal Width (cm)
84
- - 🌸 Petal Length (cm)
85
- - 🌸 Petal Width (cm)
86
-
87
- The goal of clustering is to find natural patterns among these measurements.
88
- """)
89
-
90
- # ------------- Data Visualization Tab -------------
91
- with tab2:
92
- st.header("Data Visualization")
93
-
94
- # βœ… Cluster distribution plot
95
- st.subheader("Cluster Distribution")
96
- fig, ax = plt.subplots()
97
- sns.scatterplot(
98
- x=df['sepal length (cm)'],
99
- y=df['sepal width (cm)'],
100
- hue=df['Cluster Label'],
101
- palette='viridis',
102
- s=100,
103
- alpha=0.7,
104
- ax=ax
105
- )
106
- plt.xlabel('Sepal Length (cm)')
107
- plt.ylabel('Sepal Width (cm)')
108
- st.pyplot(fig)
109
-
110
- # βœ… Heatmap (Fixed by dropping non-numeric columns)
111
- st.subheader("Heatmap of Feature Correlation")
112
- numeric_df = df.drop(columns=["Cluster", "Cluster Label"]) # Drop non-numeric columns
113
- fig, ax = plt.subplots(figsize=(6, 4))
114
- sns.heatmap(numeric_df.corr(), annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
115
- st.pyplot(fig)
116
-
117
- # βœ… Box plots (Replaced pair plot for better clarity)
118
- st.subheader("Box Plot of Features by Cluster")
119
- fig, ax = plt.subplots(figsize=(10, 6))
120
- sns.boxplot(x='Cluster Label', y='sepal length (cm)', data=df, palette='viridis', ax=ax)
121
- plt.title("Sepal Length Distribution Across Clusters")
122
- st.pyplot(fig)
123
-
124
- fig, ax = plt.subplots(figsize=(10, 6))
125
- sns.boxplot(x='Cluster Label', y='petal length (cm)', data=df, palette='viridis', ax=ax)
126
- plt.title("Petal Length Distribution Across Clusters")
127
- st.pyplot(fig)
128
-
129
- # βœ… Feature importance (Tabular format with explanation)
130
- st.subheader("Feature Importance (Based on Cluster Centers)")
131
- feature_importance = pd.DataFrame(
132
- kmeans.cluster_centers_,
133
- columns=feature_names,
134
- index=[f'Cluster {i}' for i in range(len(kmeans.cluster_centers_))]
135
- )
136
- st.dataframe(feature_importance)
137
-
138
- st.markdown("""
139
- **How to Interpret Positive and Negative Values:**
140
- - **Positive Value:** The cluster center is positioned **above the mean** for that feature.
141
- β†’ The cluster tends to have **higher values** for that feature.
142
- - **Negative Value:** The cluster center is positioned **below the mean** for that feature.
143
- β†’ The cluster tends to have **lower values** for that feature.
144
- - **Magnitude:**
145
- - Higher absolute values = Stronger influence of that feature in defining the cluster.
146
- - Lower absolute values = Less influence of that feature in cluster formation.
147
- """)
148
-
149
- # ------------- Model Prediction Tab -------------
150
- with tab3:
151
- st.header("Predict Cluster for Custom Input")
152
-
153
- # βœ… Collect user input for prediction
154
- input_features = []
155
- for feature in feature_names:
156
- value = st.number_input(f"Enter {feature}", value=0.0, step=0.1)
157
- input_features.append(value)
158
-
159
- # βœ… Scale input data
160
- input_scaled = StandardScaler().fit(X).transform([input_features])
161
-
162
- if st.button("Predict Cluster"):
163
- cluster = kmeans.predict(input_scaled)[0]
164
- label = cluster_labels[cluster]
165
- st.success(f"The predicted cluster is: **{label}**")
166
-
167
- # βœ… Show cluster center distances with explanation
168
- if st.checkbox("Show Cluster Distances"):
169
- st.markdown("""
170
- **What is Cluster Distance?**
171
- - Cluster distance represents how close your custom input is to each cluster center.
172
- - A smaller distance means your input is more similar to that cluster's typical values.
173
- """)
174
-
175
- distances = kmeans.transform(input_scaled)[0]
176
- distance_df = pd.DataFrame(
177
- distances,
178
- index=[f'Cluster {i}' for i in range(len(distances))],
179
- columns=["Distance"]
180
- )
181
- st.write(distance_df)
182
-
183
- # βœ… Plot distances
184
- fig, ax = plt.subplots()
185
- sns.barplot(
186
- x=distance_df.index,
187
- y=distance_df["Distance"],
188
- palette="viridis",
189
- ax=ax
190
- )
191
- ax.set_title("Distance to Cluster Centers")
192
- ax.set_ylabel("Distance")
193
- st.pyplot(fig)
194
-
195
- # --------------------- Footer ---------------------
196
- st.markdown("---")
197
- st.write("**Awesome 😎**")
 
 
1
+ import streamlit as st
2
+ from sklearn.datasets import load_iris
3
+ from sklearn.cluster import KMeans
4
+ from sklearn.preprocessing import StandardScaler
5
+ import pandas as pd
6
+ import seaborn as sns
7
+ import matplotlib.pyplot as plt
8
+
9
+ # --------------------- Streamlit App ---------------------
10
+ st.set_page_config(page_title="Unsupervised ML: Iris Clustering", layout="wide")
11
+
12
+ # Load and preprocess the Iris dataset
13
+ @st.cache_data
14
+ def load_data():
15
+ iris = load_iris()
16
+ X = iris.data
17
+ feature_names = iris.feature_names
18
+
19
+ scaler = StandardScaler()
20
+ X_scaled = scaler.fit_transform(X)
21
+
22
+ return X, X_scaled, feature_names
23
+
24
+ X, X_scaled, feature_names = load_data()
25
+
26
# Perform K-Means clustering
@st.cache_data
def perform_clustering(X_scaled, n_clusters=3):
    """Fit K-Means on the standardized features.

    Args:
        X_scaled: Standardized feature matrix of shape (n_samples, n_features).
        n_clusters: Number of clusters to form (default 3, matching the three
            natural Iris groupings).

    Returns:
        tuple: (fitted KMeans model, cluster id assigned to each sample).
    """
    # n_init is pinned explicitly: scikit-learn changed its default from 10 to
    # 'auto' in 1.4 (FutureWarning in 1.2/1.3), so pinning keeps the clustering
    # reproducible across library versions. random_state=42 fixes the seed.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(X_scaled)
    return kmeans, clusters

kmeans, clusters = perform_clustering(X_scaled)
34
+
35
# Create a DataFrame with the clustering results
@st.cache_data
def create_clustered_dataframe(X, clusters, feature_names):
    """Build a DataFrame of features plus cluster id and human-readable label.

    Args:
        X: Raw feature matrix of shape (n_samples, n_features).
        clusters: Cluster id per sample, as produced by KMeans.
        feature_names: Column names for the feature matrix.

    Returns:
        tuple: (DataFrame with 'Cluster' and 'Cluster Label' columns appended,
            dict mapping each cluster id present to its label).
    """
    df = pd.DataFrame(X, columns=feature_names)
    df['Cluster'] = clusters

    # Meaningful names for the three expected clusters.
    # NOTE(review): this mapping assumes KMeans with random_state=42 assigns
    # these ids to these species-like groups — re-verify if the seed or
    # n_clusters changes.
    base_labels = {0: 'Setosa-like', 1: 'Versicolor-like', 2: 'Virginica-like'}
    # Cover every cluster id actually present so .map never yields NaN
    # (e.g. if n_clusters is ever raised above 3). Identical to the base
    # mapping for the default three clusters.
    cluster_labels = {
        i: base_labels.get(i, f'Cluster {i}')
        for i in sorted({int(c) for c in clusters})
    }
    df['Cluster Label'] = df['Cluster'].map(cluster_labels)

    return df, cluster_labels

df, cluster_labels = create_clustered_dataframe(X, clusters, feature_names)
48
+
49
+
50
# App title shown at the top of every tab
st.title("🌸 Unsupervised Machine Learning: Iris Clustering App")

# Three tabs: static overview, charts of the clustering, interactive prediction
tab1, tab2, tab3 = st.tabs(["🏠 About", "πŸ“Š Data Visualization", "πŸ”Ž Model Prediction"])
55
+
56
# ------------- About Tab -------------
# Static documentation tab: explains the workflow (preprocess -> cluster ->
# label -> predict) and the dataset. Display-only; no computation here.
with tab1:
    st.header("About This App")
    st.markdown("""
    ## **Overview**
    This application demonstrates **unsupervised machine learning** using the Iris dataset.
    The app clusters data points based on the features of iris flowers using the **K-Means clustering algorithm**.
    After clustering, meaningful labels are assigned based on the cluster’s statistical properties.

    ## **How It Works**
    1. **Data Preprocessing:**
       - The dataset is standardized using `StandardScaler` to ensure uniform feature scaling.

    2. **Clustering:**
       - K-Means clustering is applied to group the data into **three clusters**.
       - The number of clusters is based on the natural grouping of the Iris dataset.

    3. **Cluster Labeling:**
       - After clustering, each cluster is assigned a meaningful label based on its centroid properties and domain knowledge.

    4. **Model Testing:**
       - The app allows the user to enter custom feature values.
       - The model predicts the cluster and assigns a meaningful label to the input data.

    ## **Dataset Information**
    The Iris dataset contains **150 samples** of iris flowers.
    Each sample includes the following features:
    - 🌸 Sepal Length (cm)
    - 🌸 Sepal Width (cm)
    - 🌸 Petal Length (cm)
    - 🌸 Petal Width (cm)

    The goal of clustering is to find natural patterns among these measurements.
    """)
90
+
91
# ------------- Data Visualization Tab -------------
# Charts of the clustering result: scatter, correlation heatmap, box plots,
# and the cluster-center table.
with tab2:
    st.header("Data Visualization")

    # Scatter of sepal length vs. width, colored by cluster label
    st.subheader("Cluster Distribution")
    fig, ax = plt.subplots()
    sns.scatterplot(
        x=df['sepal length (cm)'],
        y=df['sepal width (cm)'],
        hue=df['Cluster Label'],
        palette='viridis',
        s=100,
        alpha=0.7,
        ax=ax
    )
    ax.set_xlabel('Sepal Length (cm)')
    ax.set_ylabel('Sepal Width (cm)')
    st.pyplot(fig)

    # Correlation heatmap over the numeric feature columns only
    st.subheader("Heatmap of Feature Correlation")
    numeric_df = df.drop(columns=["Cluster", "Cluster Label"])  # drop non-numeric / id columns
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(numeric_df.corr(), annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
    st.pyplot(fig)

    # Box plots grouped by cluster label.
    # FIX: seaborn 0.13+ deprecates passing `palette` without `hue`; assigning
    # the x-variable to `hue` with legend=False keeps identical per-box colors.
    st.subheader("Box Plot of Features by Cluster")
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(x='Cluster Label', y='sepal length (cm)', data=df,
                hue='Cluster Label', palette='viridis', legend=False, ax=ax)
    ax.set_title("Sepal Length Distribution Across Clusters")
    st.pyplot(fig)

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(x='Cluster Label', y='petal length (cm)', data=df,
                hue='Cluster Label', palette='viridis', legend=False, ax=ax)
    ax.set_title("Petal Length Distribution Across Clusters")
    st.pyplot(fig)

    # Cluster centers (standardized units) shown as a feature-importance table
    st.subheader("Feature Importance (Based on Cluster Centers)")
    feature_importance = pd.DataFrame(
        kmeans.cluster_centers_,
        columns=feature_names,
        index=[f'Cluster {i}' for i in range(len(kmeans.cluster_centers_))]
    )
    st.dataframe(feature_importance)

    st.markdown("""
    **How to Interpret Positive and Negative Values:**
    - **Positive Value:** The cluster center is positioned **above the mean** for that feature.
      β†’ The cluster tends to have **higher values** for that feature.
    - **Negative Value:** The cluster center is positioned **below the mean** for that feature.
      β†’ The cluster tends to have **lower values** for that feature.
    - **Magnitude:**
      - Higher absolute values = Stronger influence of that feature in defining the cluster.
      - Lower absolute values = Less influence of that feature in cluster formation.
    """)
149
+
150
# ------------- Model Prediction Tab -------------
# Interactive tab: collect feature values, standardize them into the model's
# input space, and report the predicted cluster (plus optional distances).
with tab3:
    st.header("Predict Cluster for Custom Input")

    # One number input per dataset feature
    input_features = []
    for feature in feature_names:
        value = st.number_input(f"Enter {feature}", value=0.0, step=0.1)
        input_features.append(value)

    # Standardize the input with a scaler fit on the full training data, so the
    # input lives in the same standardized space the model was trained in.
    input_scaled = StandardScaler().fit(X).transform([input_features])

    if st.button("Predict Cluster"):
        cluster = kmeans.predict(input_scaled)[0]
        label = cluster_labels[cluster]
        st.success(f"The predicted cluster is: **{label}**")

    # Optional: Euclidean distance from the input to every cluster center
    if st.checkbox("Show Cluster Distances"):
        st.markdown("""
        **What is Cluster Distance?**
        - Cluster distance represents how close your custom input is to each cluster center.
        - A smaller distance means your input is more similar to that cluster's typical values.
        """)

        # kmeans.transform returns the distance to each center for the sample
        distances = kmeans.transform(input_scaled)[0]
        distance_df = pd.DataFrame(
            distances,
            index=[f'Cluster {i}' for i in range(len(distances))],
            columns=["Distance"]
        )
        st.write(distance_df)

        # Bar chart of the distances.
        # FIX: seaborn 0.13+ deprecates passing `palette` without `hue`;
        # mapping the x-variable to `hue` with legend=False keeps the colors.
        fig, ax = plt.subplots()
        sns.barplot(
            x=distance_df.index,
            y=distance_df["Distance"],
            hue=distance_df.index,
            palette="viridis",
            legend=False,
            ax=ax
        )
        ax.set_title("Distance to Cluster Centers")
        ax.set_ylabel("Distance")
        st.pyplot(fig)
195
+
196
# --------------------- Footer ---------------------
st.markdown("---")  # horizontal rule separating the footer from tab content
st.write("**Awesome 😎**")