# Source: Hugging Face Spaces - satya11 / Natural_Language_Processing (status: Sleeping)
# File size: 9,845 Bytes
# Revision: 9cc268e

import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.ensemble import VotingClassifier, BaggingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score

# Page configuration and title for the Streamlit app.
_PAGE_TITLE = "🧠 Explore Ensemble Learning"
st.set_page_config(page_title=_PAGE_TITLE, layout="wide")
st.title("🧠 Ensemble Learning Playground")

# ------------------------------------
# Intro
# ------------------------------------
# Markdown shown at the top of the page: a short definition of ensemble learning.
_INTRO_MD = """
## 🤝 What is Ensemble Learning?
Ensemble Learning combines multiple machine learning models to improve overall performance and robustness.
> ✨ "The wisdom of the crowd" — combining multiple opinions leads to smarter predictions!
"""

# Markdown shown inside the collapsible section: one bullet per ensemble method
# offered by the app (Voting, Bagging, Random Forest).
_METHODS_MD = """
    ### 🧠 Key Ensemble Methods Explained:
    - **Voting Classifier**: Combines predictions from multiple models (like Logistic Regression, Decision Tree, and KNN).  
      - *Hard voting*: Picks the class with the most votes.
      - *Soft voting*: Averages predicted probabilities (requires models that support `predict_proba`).
    - **Bagging (Bootstrap Aggregating)**: Trains the same model (e.g., Decision Tree) on different subsets of data and averages their outputs to reduce overfitting.
    - **Random Forest**: A special type of bagging using multiple decision trees with added randomness for better performance.
    """

st.markdown(_INTRO_MD)

with st.expander("📚 Learn More About Ensemble Methods"):
    st.markdown(_METHODS_MD)

# ------------------------------------
# Load Dataset
# ------------------------------------
# Build one DataFrame holding the four feature columns, the integer class
# label ("target"), and the matching class name ("species").
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df["target"] = iris.target
df["species"] = df["target"].map(lambda label: iris.target_names[label])

# ------------------------------------
# Dataset Exploration
# ------------------------------------
# Three tabs: a preview of the raw rows, interactive plots, and summary
# statistics. Every plot is drawn on its own Figure/Axes and closed after
# rendering so figures do not pile up in memory across Streamlit reruns.
tab1, tab2, tab3 = st.tabs(["📋 Dataset", "📊 Visualizations", "📈 Statistics"])

with tab1:
    st.subheader("🌼 Iris Dataset Preview")
    st.dataframe(df.head(), use_container_width=True)

    st.markdown("""
    **Dataset Info:**
    - 150 samples (50 per class)
    - 4 features (sepal length, sepal width, petal length, petal width)
    - 3 target classes (setosa, versicolor, virginica)
    """)

with tab2:
    st.subheader("Feature Relationships")
    col1, col2 = st.columns(2)

    with col1:
        features = st.multiselect("Select two features", iris.feature_names, default=iris.feature_names[:2])
        if len(features) == 2:
            fig, ax = plt.subplots(figsize=(8, 5))
            sns.scatterplot(
                data=df,
                x=features[0],
                y=features[1],
                hue="species",
                palette="viridis",
                s=80,
                ax=ax,
            )
            ax.set_title(f"{features[0]} vs {features[1]}")
            st.pyplot(fig)
            plt.close(fig)
        else:
            # A scatter plot needs exactly one x and one y feature; tell the
            # user why nothing is drawn instead of leaving the column blank.
            st.info("Please select exactly two features to draw the scatter plot.")

    with col2:
        feature = st.selectbox("Select feature for distribution", iris.feature_names)
        fig, ax = plt.subplots(figsize=(8, 5))
        sns.boxplot(data=df, x="species", y=feature, palette="viridis", ax=ax)
        ax.set_title(f"Distribution of {feature} by species")
        st.pyplot(fig)
        plt.close(fig)

with tab3:
    st.subheader("Dataset Statistics")
    st.dataframe(df.describe(), use_container_width=True)

    # Pairwise correlation between the four feature columns only
    # (the label columns are excluded).
    corr = df[iris.feature_names].corr()
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(corr, annot=True, cmap="coolwarm", center=0, ax=ax)
    ax.set_title("Feature Correlation Matrix")
    st.pyplot(fig)
    plt.close(fig)

# ------------------------------------
# Sidebar for Model Selection
# ------------------------------------
st.sidebar.header("🔧 Model Configuration")
ensemble_type = st.sidebar.selectbox("Choose Ensemble Method", 
                                    ["Voting", "Bagging", "Random Forest"],
                                    help="Select the ensemble learning technique to use")

# Common parameters
test_size = st.sidebar.slider("Test Set Size (%)", 10, 40, 20)
random_state = st.sidebar.number_input("Random State", 0, 100, 42)

# Prepare Data
X = df[iris.feature_names]
y = df["target"]
# The slider value is a percentage, so convert it to a fraction.
# Stratify on the label so every class appears in the (small) test set in
# the same proportion as in the full data; an unstratified split can leave a
# class under-represented or missing, which distorts the metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size / 100,
    random_state=random_state,
    stratify=y,
)

# Fit the scaler on the training rows only, then apply the same
# transformation to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ------------------------------------
# Model Configuration
# ------------------------------------
# Build the (not yet fitted) ensemble chosen in the sidebar. Each branch
# renders its own hyperparameter widgets and assigns `model`.
if ensemble_type == "Voting":
    st.sidebar.subheader("Voting Classifier Settings")
    voting_type = st.sidebar.radio("Voting Type", ["Hard", "Soft"])
    voting = "hard" if voting_type == "Hard" else "soft"

    # Initialize models
    clf1 = LogisticRegression(random_state=random_state)
    clf2 = DecisionTreeClassifier(random_state=random_state)
    clf3 = KNeighborsClassifier()

    model = VotingClassifier(estimators=[
        ('lr', clf1), 
        ('dt', clf2), 
        ('knn', clf3)
    ], voting=voting)

elif ensemble_type == "Bagging":
    st.sidebar.subheader("Bagging Settings")
    n_estimators = st.sidebar.slider("Number of Estimators", 1, 100, 10)
    max_samples = st.sidebar.slider("Max Samples per Estimator", 0.1, 1.0, 1.0)

    base_model = DecisionTreeClassifier(random_state=random_state)
    model = BaggingClassifier(
        estimator=base_model,
        n_estimators=n_estimators,
        max_samples=max_samples,
        random_state=random_state
    )

elif ensemble_type == "Random Forest":
    st.sidebar.subheader("Random Forest Settings")
    n_estimators = st.sidebar.slider("Number of Trees", 1, 200, 100)
    # A slider cannot express "no limit": passing None as its value makes
    # Streamlit fall back to min_value, which would cap every tree at depth 1.
    # Use an opt-in checkbox instead and pass max_depth=None (unlimited depth)
    # to the forest when the limit is not enabled.
    limit_depth = st.sidebar.checkbox(
        "Limit Max Depth",
        value=False,
        help="Leave unchecked to let trees grow without a depth limit",
    )
    max_depth = st.sidebar.slider("Max Depth", 1, 20, 5) if limit_depth else None
    min_samples_split = st.sidebar.slider("Min Samples Split", 2, 10, 2)

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        random_state=random_state
    )

# ------------------------------------
# Model Training and Evaluation
# ------------------------------------
st.subheader(f"🚀 {ensemble_type} Classifier Performance")

# Train model
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Explicit label list (0..n_classes-1) so the report and the confusion matrix
# always have one row/column per class, even if a class is absent from both
# y_test and y_pred; otherwise their shape would not match target_names.
class_labels = list(range(len(iris.target_names)))

# Metrics (weighted by class support). zero_division=0 keeps the score at 0
# for a class with no predicted/true samples without emitting a warning.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Display metrics
col1, col2, col3, col4 = st.columns(4)
col1.metric("Accuracy", f"{accuracy:.2%}")
col2.metric("Precision", f"{precision:.2%}")
col3.metric("Recall", f"{recall:.2%}")
col4.metric("F1 Score", f"{f1:.2%}")

# Detailed evaluation
tab_eval1, tab_eval2 = st.tabs(["📝 Classification Report", "📊 Confusion Matrix"])

with tab_eval1:
    st.text(
        classification_report(
            y_test,
            y_pred,
            labels=class_labels,
            target_names=iris.target_names,
            zero_division=0,
        )
    )

with tab_eval2:
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=iris.target_names,
                yticklabels=iris.target_names,
                ax=ax)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_title("Confusion Matrix")
    st.pyplot(fig)
    # Release the figure; otherwise one more figure stays open per rerun.
    plt.close(fig)

# Feature importance for Random Forest
# Only the Random Forest branch is handled here; the plot reads the fitted
# model's `feature_importances_` and shows them in descending order.
if ensemble_type == "Random Forest":
    st.subheader("🌳 Feature Importance")
    feature_importance = model.feature_importances_
    importance_df = pd.DataFrame({
        "Feature": iris.feature_names,
        "Importance": feature_importance
    }).sort_values("Importance", ascending=False)

    fig, ax = plt.subplots(figsize=(10, 5))
    # Draw on the axes created above rather than on the implicit current axes.
    sns.barplot(data=importance_df, x="Importance", y="Feature", palette="viridis", ax=ax)
    ax.set_title("Random Forest Feature Importance")
    st.pyplot(fig)
    # Release the figure; otherwise one more figure stays open per rerun.
    plt.close(fig)

# ------------------------------------
# Prediction Playground
# ------------------------------------
# Lets the user enter one flower's measurements and classifies it with the
# model trained above, using the same scaler that was fit on the training set.
st.subheader("🔮 Make Your Own Prediction")

col1, col2, col3, col4 = st.columns(4)
with col1:
    sepal_length = st.number_input("Sepal length (cm)", min_value=4.0, max_value=8.0, value=5.1)
with col2:
    sepal_width = st.number_input("Sepal width (cm)", min_value=2.0, max_value=5.0, value=3.5)
with col3:
    petal_length = st.number_input("Petal length (cm)", min_value=1.0, max_value=7.0, value=1.4)
with col4:
    petal_width = st.number_input("Petal width (cm)", min_value=0.1, max_value=2.5, value=0.2)

if st.button("Predict Species"):
    # The scaler was fit on a DataFrame with named columns, so pass a
    # one-row DataFrame with the same column names. This avoids sklearn's
    # feature-name mismatch warning and makes the column order explicit.
    input_data = pd.DataFrame(
        [[sepal_length, sepal_width, petal_length, petal_width]],
        columns=iris.feature_names,
    )
    input_scaled = scaler.transform(input_data)
    prediction = model.predict(input_scaled)[0]
    # Probabilities are shown only when the model exposes predict_proba.
    proba = model.predict_proba(input_scaled)[0] if hasattr(model, "predict_proba") else None

    st.success(f"Predicted Species: **{iris.target_names[prediction]}**")

    if proba is not None:
        st.write("Prediction Probabilities:")
        proba_df = pd.DataFrame({
            "Species": iris.target_names,
            "Probability": proba
        }).sort_values("Probability", ascending=False)
        st.dataframe(proba_df.style.format({"Probability": "{:.2%}"}), hide_index=True)

# ------------------------------------
# Final Summary
# ------------------------------------
# Summarize the model that was actually trained in this run. Only one model is
# fitted per run, so it is labelled "Selected Model" rather than "Best Model".
if ensemble_type == "Random Forest":
    # Derive the insight from the fitted forest instead of hard-coding which
    # feature is most informative.
    top_feature = iris.feature_names[int(model.feature_importances_.argmax())]
    insight = f"Feature importance ranks {top_feature} as the most informative feature"
else:
    insight = "Combining multiple models leads to more robust predictions"

st.markdown(f"""
---
## 📌 Summary
- **Selected Model**: {ensemble_type} with {accuracy:.2%} accuracy
- **Key Insights**: {insight}

> 🎯 Ensemble methods often outperform individual models by reducing variance and bias!
""")