import streamlit as st import pandas as pd import matplotlib.pyplot as plt import seaborn as sns from sklearn.datasets import load_iris from sklearn.ensemble import VotingClassifier, BaggingClassifier, RandomForestClassifier from sklearn.linear_model import LogisticRegression from sklearn.tree import DecisionTreeClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score # Set up Streamlit st.set_page_config(page_title="🧠 Explore Ensemble Learning", layout="wide") st.title("🧠 Ensemble Learning Playground") # ------------------------------------ # Intro # ------------------------------------ st.markdown(""" ## 🤝 What is Ensemble Learning? Ensemble Learning combines multiple machine learning models to improve overall performance and robustness. > ✨ "The wisdom of the crowd" — combining multiple opinions leads to smarter predictions! """) with st.expander("📚 Learn More About Ensemble Methods"): st.markdown(""" ### 🧠 Key Ensemble Methods Explained: - **Voting Classifier**: Combines predictions from multiple models (like Logistic Regression, Decision Tree, and KNN). - *Hard voting*: Picks the class with the most votes. - *Soft voting*: Averages predicted probabilities (requires models that support `predict_proba`). - **Bagging (Bootstrap Aggregating)**: Trains the same model (e.g., Decision Tree) on different subsets of data and averages their outputs to reduce overfitting. - **Random Forest**: A special type of bagging using multiple decision trees with added randomness for better performance. """) # ------------------------------------ # Load Dataset # ------------------------------------ iris = load_iris() df = pd.DataFrame(iris.data, columns=iris.feature_names) df["target"] = iris.target df["species"] = df["target"].apply(lambda x: iris.target_names[x]) # ------------------------------------ # Dataset Exploration # ------------------------------------ tab1, tab2, tab3 = st.tabs(["📋 Dataset", "📊 Visualizations", "📈 Statistics"]) with tab1: st.subheader("🌼 Iris Dataset Preview") st.dataframe(df.head(), use_container_width=True) st.markdown(""" **Dataset Info:** - 150 samples (50 per class) - 4 features (sepal length, sepal width, petal length, petal width) - 3 target classes (setosa, versicolor, virginica) """) with tab2: st.subheader("Feature Relationships") col1, col2 = st.columns(2) with col1: features = st.multiselect("Select two features", iris.feature_names, default=iris.feature_names[:2]) if len(features) == 2: plt.figure(figsize=(8, 5)) sns.scatterplot(data=df, x=features[0], y=features[1], hue="species", palette="viridis", s=80) plt.title(f"{features[0]} vs {features[1]}") st.pyplot(plt) plt.clf() with col2: feature = st.selectbox("Select feature for distribution", iris.feature_names) plt.figure(figsize=(8, 5)) sns.boxplot(data=df, x="species", y=feature, palette="viridis") plt.title(f"Distribution of {feature} by species") st.pyplot(plt) plt.clf() with tab3: st.subheader("Dataset Statistics") st.dataframe(df.describe(), use_container_width=True) corr = df[iris.feature_names].corr() plt.figure(figsize=(8, 6)) sns.heatmap(corr, annot=True, cmap="coolwarm", center=0) plt.title("Feature Correlation Matrix") st.pyplot(plt) plt.clf() # ------------------------------------ # Sidebar for Model Selection # ------------------------------------ st.sidebar.header("🔧 Model Configuration") ensemble_type = st.sidebar.selectbox("Choose Ensemble Method", ["Voting", "Bagging", "Random Forest"], help="Select the ensemble learning technique to use") # Common parameters test_size = st.sidebar.slider("Test Set Size (%)", 10, 40, 20) random_state = st.sidebar.number_input("Random State", 0, 100, 42) # Prepare Data X = df[iris.feature_names] y = df["target"] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size/100, random_state=random_state) scaler = StandardScaler() X_train_scaled = scaler.fit_transform(X_train) X_test_scaled = scaler.transform(X_test) # ------------------------------------ # Model Configuration # ------------------------------------ if ensemble_type == "Voting": st.sidebar.subheader("Voting Classifier Settings") voting_type = st.sidebar.radio("Voting Type", ["Hard", "Soft"]) voting = "hard" if voting_type == "Hard" else "soft" # Initialize models clf1 = LogisticRegression(random_state=random_state) clf2 = DecisionTreeClassifier(random_state=random_state) clf3 = KNeighborsClassifier() model = VotingClassifier(estimators=[ ('lr', clf1), ('dt', clf2), ('knn', clf3) ], voting=voting) elif ensemble_type == "Bagging": st.sidebar.subheader("Bagging Settings") n_estimators = st.sidebar.slider("Number of Estimators", 1, 100, 10) max_samples = st.sidebar.slider("Max Samples per Estimator", 0.1, 1.0, 1.0) base_model = DecisionTreeClassifier(random_state=random_state) model = BaggingClassifier( estimator=base_model, n_estimators=n_estimators, max_samples=max_samples, random_state=random_state ) elif ensemble_type == "Random Forest": st.sidebar.subheader("Random Forest Settings") n_estimators = st.sidebar.slider("Number of Trees", 1, 200, 100) max_depth = st.sidebar.slider("Max Depth", 1, 20, None) min_samples_split = st.sidebar.slider("Min Samples Split", 2, 10, 2) model = RandomForestClassifier( n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, random_state=random_state ) # ------------------------------------ # Model Training and Evaluation # ------------------------------------ st.subheader(f"🚀 {ensemble_type} Classifier Performance") # Train model model.fit(X_train_scaled, y_train) y_pred = model.predict(X_test_scaled) # Metrics accuracy = accuracy_score(y_test, y_pred) precision = precision_score(y_test, y_pred, average='weighted') recall = recall_score(y_test, y_pred, average='weighted') f1 = f1_score(y_test, y_pred, average='weighted') # Display metrics col1, col2, col3, col4 = st.columns(4) col1.metric("Accuracy", f"{accuracy:.2%}") col2.metric("Precision", f"{precision:.2%}") col3.metric("Recall", f"{recall:.2%}") col4.metric("F1 Score", f"{f1:.2%}") # Detailed evaluation tab_eval1, tab_eval2 = st.tabs(["📝 Classification Report", "📊 Confusion Matrix"]) with tab_eval1: st.text(classification_report(y_test, y_pred, target_names=iris.target_names)) with tab_eval2: cm = confusion_matrix(y_test, y_pred) fig, ax = plt.subplots(figsize=(8, 6)) sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=iris.target_names, yticklabels=iris.target_names) plt.xlabel("Predicted") plt.ylabel("Actual") plt.title("Confusion Matrix") st.pyplot(fig) # Feature importance for Random Forest if ensemble_type == "Random Forest": st.subheader("🌳 Feature Importance") feature_importance = model.feature_importances_ importance_df = pd.DataFrame({ "Feature": iris.feature_names, "Importance": feature_importance }).sort_values("Importance", ascending=False) fig, ax = plt.subplots(figsize=(10, 5)) sns.barplot(data=importance_df, x="Importance", y="Feature", palette="viridis") plt.title("Random Forest Feature Importance") st.pyplot(fig) # ------------------------------------ # Prediction Playground # ------------------------------------ st.subheader("🔮 Make Your Own Prediction") col1, col2, col3, col4 = st.columns(4) with col1: sepal_length = st.number_input("Sepal length (cm)", min_value=4.0, max_value=8.0, value=5.1) with col2: sepal_width = st.number_input("Sepal width (cm)", min_value=2.0, max_value=5.0, value=3.5) with col3: petal_length = st.number_input("Petal length (cm)", min_value=1.0, max_value=7.0, value=1.4) with col4: petal_width = st.number_input("Petal width (cm)", min_value=0.1, max_value=2.5, value=0.2) if st.button("Predict Species"): input_data = [[sepal_length, sepal_width, petal_length, petal_width]] input_scaled = scaler.transform(input_data) prediction = model.predict(input_scaled)[0] proba = model.predict_proba(input_scaled)[0] if hasattr(model, "predict_proba") else None st.success(f"Predicted Species: **{iris.target_names[prediction]}**") if proba is not None: st.write("Prediction Probabilities:") proba_df = pd.DataFrame({ "Species": iris.target_names, "Probability": proba }).sort_values("Probability", ascending=False) st.dataframe(proba_df.style.format({"Probability": "{:.2%}"}), hide_index=True) # ------------------------------------ # Final Summary # ------------------------------------ st.markdown(""" --- ## 📌 Summary - **Best Model**: {ensemble_type} with {accuracy:.2%} accuracy - **Key Insights**: {insight} > 🎯 Ensemble methods often outperform individual models by reducing variance and bias! """.format( ensemble_type=ensemble_type, accuracy=accuracy, insight="Feature importance shows petal measurements are most informative" if ensemble_type == "Random Forest" else "Combining multiple models leads to more robust predictions" ))