"""Streamlit app: unsupervised K-Means clustering of the Iris dataset.

Three tabs: an About page, visualizations of the clustered data, and an
interactive predictor that assigns a cluster to custom feature values.

NOTE(review): all emoji/punctuation below were mojibake (UTF-8 decoded as
cp1252) in the original source; they have been restored to proper Unicode.
"""

import streamlit as st
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# --------------------- Streamlit App ---------------------
st.set_page_config(page_title="Unsupervised ML: Iris Clustering", layout="wide")


@st.cache_data
def load_data():
    """Load the Iris dataset and standardize its features.

    Returns:
        X: raw feature matrix (150 samples x 4 features).
        X_scaled: standardized feature matrix.
        feature_names: the four iris feature names.
        scaler: the fitted StandardScaler — returned so user input in the
            prediction tab is scaled with the *same* transformation used for
            training, instead of re-fitting a new scaler on every rerun.
    """
    iris = load_iris()
    X = iris.data
    feature_names = iris.feature_names
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X, X_scaled, feature_names, scaler


X, X_scaled, feature_names, scaler = load_data()


# cache_resource (not cache_data) is the documented cache for fitted models:
# it stores the estimator object itself rather than pickling a copy.
@st.cache_resource
def perform_clustering(X_scaled, n_clusters=3):
    """Fit K-Means on the standardized features; return (model, labels)."""
    # n_init=10 pins the pre-sklearn-1.4 default so results stay reproducible
    # across library versions; random_state fixes centroid initialization.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(X_scaled)
    return kmeans, clusters


kmeans, clusters = perform_clustering(X_scaled)


@st.cache_data
def create_clustered_dataframe(X, clusters, feature_names):
    """Build a DataFrame of raw features plus cluster ids and readable labels.

    NOTE(review): the id -> label mapping assumes the cluster indices produced
    with random_state=42 line up with the three iris species. If the seed,
    n_init, or sklearn version changes, re-check this mapping against the
    cluster centroids.
    """
    df = pd.DataFrame(X, columns=feature_names)
    df['Cluster'] = clusters
    cluster_labels = {0: 'Setosa-like', 1: 'Versicolor-like', 2: 'Virginica-like'}
    df['Cluster Label'] = df['Cluster'].map(cluster_labels)
    return df, cluster_labels


df, cluster_labels = create_clustered_dataframe(X, clusters, feature_names)

# App title
st.title("🌸 Unsupervised Machine Learning: Iris Clustering App")

# Tabs for organization
tab1, tab2, tab3 = st.tabs(["🏠 About", "📊 Data Visualization", "🔎 Model Prediction"])

# ------------- About Tab -------------
with tab1:
    st.header("About This App")
    st.markdown("""
    ## **Overview**
    This application demonstrates **unsupervised machine learning** using the Iris dataset.
    The app clusters data points based on the features of iris flowers using the **K-Means clustering algorithm**.
    After clustering, meaningful labels are assigned based on the cluster's statistical properties.

    ## **How It Works**
    1. **Data Preprocessing:**
       - The dataset is standardized using `StandardScaler` to ensure uniform feature scaling.
    2. **Clustering:**
       - K-Means clustering is applied to group the data into **three clusters**.
       - The number of clusters is based on the natural grouping of the Iris dataset.
    3. **Cluster Labeling:**
       - After clustering, each cluster is assigned a meaningful label based on its centroid properties and domain knowledge.
    4. **Model Testing:**
       - The app allows the user to enter custom feature values.
       - The model predicts the cluster and assigns a meaningful label to the input data.

    ## **Dataset Information**
    """)
    # Preview the raw features; df already holds them (no need to reload iris).
    st.write(df[feature_names].head())
    st.markdown("""
    The Iris dataset contains **150 samples** of iris flowers.
    Each sample includes the following features:
    - 🌸 Sepal Length (cm)
    - 🌸 Sepal Width (cm)
    - 🌸 Petal Length (cm)
    - 🌸 Petal Width (cm)

    The goal of clustering is to find natural patterns among these measurements.
    """)

# ------------- Data Visualization Tab -------------
with tab2:
    st.header("Data Visualization")

    # Cluster distribution scatter plot
    st.subheader("Cluster Distribution")
    fig, ax = plt.subplots()
    sns.scatterplot(
        x=df['sepal length (cm)'],
        y=df['sepal width (cm)'],
        hue=df['Cluster Label'],
        palette='viridis',
        s=100,
        alpha=0.7,
        ax=ax
    )
    plt.xlabel('Sepal Length (cm)')
    plt.ylabel('Sepal Width (cm)')
    st.pyplot(fig)

    # Heatmap — non-numeric columns dropped so corr() works
    st.subheader("Heatmap of Feature Correlation")
    numeric_df = df.drop(columns=["Cluster", "Cluster Label"])
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(numeric_df.corr(), annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
    st.pyplot(fig)

    # Box plots of individual features by cluster.
    # hue= + legend=False keeps per-cluster colors without the seaborn >= 0.14
    # deprecation warning for palette-without-hue.
    st.subheader("Box Plot of Features by Cluster")
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(x='Cluster Label', y='sepal length (cm)', data=df,
                hue='Cluster Label', legend=False, palette='viridis', ax=ax)
    plt.title("Sepal Length Distribution Across Clusters")
    st.pyplot(fig)

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(x='Cluster Label', y='petal length (cm)', data=df,
                hue='Cluster Label', legend=False, palette='viridis', ax=ax)
    plt.title("Petal Length Distribution Across Clusters")
    st.pyplot(fig)

    # Feature importance: cluster centers in standardized feature space
    st.subheader("Feature Importance (Based on Cluster Centers)")
    feature_importance = pd.DataFrame(
        kmeans.cluster_centers_,
        columns=feature_names,
        index=[f'Cluster {i}' for i in range(len(kmeans.cluster_centers_))]
    )
    st.dataframe(feature_importance)

    st.markdown("""
    **How to Interpret Positive and Negative Values:**
    - **Positive Value:** The cluster center is positioned **above the mean** for that feature.
      → The cluster tends to have **higher values** for that feature.
    - **Negative Value:** The cluster center is positioned **below the mean** for that feature.
      → The cluster tends to have **lower values** for that feature.
    - **Magnitude:**
      - Higher absolute values = Stronger influence of that feature in defining the cluster.
      - Lower absolute values = Less influence of that feature in cluster formation.
    """)

# ------------- Model Prediction Tab -------------
with tab3:
    st.header("Predict Cluster for Custom Input")

    # Collect user input for each feature
    input_features = []
    for feature in feature_names:
        value = st.number_input(f"Enter {feature}", value=0.0, step=0.1)
        input_features.append(value)

    # Scale with the scaler fitted on the training data (same transformation
    # the model was trained on; no re-fitting on every rerun).
    input_scaled = scaler.transform([input_features])

    if st.button("Predict Cluster"):
        cluster = kmeans.predict(input_scaled)[0]
        label = cluster_labels[cluster]
        st.success(f"The predicted cluster is: **{label}**")

    # Optional: distances from the input to each cluster center
    if st.checkbox("Show Cluster Distances"):
        st.markdown("""
        **What is Cluster Distance?**
        - Cluster distance represents how close your custom input is to each cluster center.
        - A smaller distance means your input is more similar to that cluster's typical values.
        """)
        distances = kmeans.transform(input_scaled)[0]
        distance_df = pd.DataFrame(
            distances,
            index=[f'Cluster {i}' for i in range(len(distances))],
            columns=["Distance"]
        )
        st.write(distance_df)

        # Bar chart of the distances
        fig, ax = plt.subplots()
        sns.barplot(
            x=distance_df.index,
            y=distance_df["Distance"],
            hue=distance_df.index,
            legend=False,
            palette="viridis",
            ax=ax
        )
        ax.set_title("Distance to Cluster Centers")
        ax.set_ylabel("Distance")
        st.pyplot(fig)

# --------------------- Footer ---------------------
st.markdown("---")
st.write("**Awesome 😎**")