# Scrape artifact removed ("Spaces: / Sleeping / Sleeping" — Hugging Face Spaces page chrome).
# Streamlit app: Ensemble Learning Playground on the Iris dataset.
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import streamlit as st
from sklearn.datasets import load_iris
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
# ------------------------------------
# Page setup and introduction
# ------------------------------------
# NOTE(review): the emoji in these UI strings appear mojibake'd in the source
# (e.g. "๐ง"); they are preserved verbatim here -- confirm intended glyphs
# against the original app before changing them.
st.set_page_config(page_title="๐ง Explore Ensemble Learning", layout="wide")
st.title("๐ง Ensemble Learning Playground")

# High-level explanation shown at the top of the page.
st.markdown("""
## ๐ค What is Ensemble Learning?
Ensemble Learning combines multiple machine learning models to improve overall performance and robustness.
> โจ "The wisdom of the crowd" โ combining multiple opinions leads to smarter predictions!
""")

# Collapsible primer on the three ensemble methods configurable below.
with st.expander("๐ Learn More About Ensemble Methods"):
    st.markdown("""
### ๐ง Key Ensemble Methods Explained:
- **Voting Classifier**: Combines predictions from multiple models (like Logistic Regression, Decision Tree, and KNN).
- *Hard voting*: Picks the class with the most votes.
- *Soft voting*: Averages predicted probabilities (requires models that support `predict_proba`).
- **Bagging (Bootstrap Aggregating)**: Trains the same model (e.g., Decision Tree) on different subsets of data and averages their outputs to reduce overfitting.
- **Random Forest**: A special type of bagging using multiple decision trees with added randomness for better performance.
""")
# ------------------------------------
# Load Dataset
# ------------------------------------
# Iris: 150 samples, 4 numeric features, 3 classes (50 samples each).
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df["target"] = iris.target
# Vectorized label -> name lookup (replaces the original per-row .apply lambda;
# same values, one numpy fancy-indexing operation instead of 150 Python calls).
df["species"] = iris.target_names[df["target"]]
# ------------------------------------
# Dataset Exploration
# ------------------------------------
tab1, tab2, tab3 = st.tabs(["๐ Dataset", "๐ Visualizations", "๐ Statistics"])

with tab1:
    st.subheader("๐ผ Iris Dataset Preview")
    st.dataframe(df.head(), use_container_width=True)
    st.markdown("""
**Dataset Info:**
- 150 samples (50 per class)
- 4 features (sepal length, sepal width, petal length, petal width)
- 3 target classes (setosa, versicolor, virginica)
""")

with tab2:
    st.subheader("Feature Relationships")
    col1, col2 = st.columns(2)
    with col1:
        features = st.multiselect("Select two features", iris.feature_names, default=iris.feature_names[:2])
        # Only draw when exactly two features are selected for the scatter axes.
        if len(features) == 2:
            # Fix: draw on an explicit Figure/Axes. Passing the pyplot module to
            # st.pyplot (the original st.pyplot(plt)) relies on deprecated global
            # figure state and is unsafe across Streamlit reruns.
            fig, ax = plt.subplots(figsize=(8, 5))
            sns.scatterplot(data=df, x=features[0], y=features[1], hue="species", palette="viridis", s=80, ax=ax)
            ax.set_title(f"{features[0]} vs {features[1]}")
            st.pyplot(fig)
            plt.close(fig)  # release the figure; replaces plt.clf()
    with col2:
        feature = st.selectbox("Select feature for distribution", iris.feature_names)
        fig, ax = plt.subplots(figsize=(8, 5))
        sns.boxplot(data=df, x="species", y=feature, palette="viridis", ax=ax)
        ax.set_title(f"Distribution of {feature} by species")
        st.pyplot(fig)
        plt.close(fig)

with tab3:
    st.subheader("Dataset Statistics")
    st.dataframe(df.describe(), use_container_width=True)
    # Pairwise Pearson correlation of the four numeric features.
    corr = df[iris.feature_names].corr()
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(corr, annot=True, cmap="coolwarm", center=0, ax=ax)
    ax.set_title("Feature Correlation Matrix")
    st.pyplot(fig)
    plt.close(fig)
# ------------------------------------
# Sidebar for Model Selection
# ------------------------------------
st.sidebar.header("๐ง Model Configuration")
ensemble_type = st.sidebar.selectbox(
    "Choose Ensemble Method",
    ["Voting", "Bagging", "Random Forest"],
    help="Select the ensemble learning technique to use",
)

# Common parameters shared by all three ensemble methods.
test_size = st.sidebar.slider("Test Set Size (%)", 10, 40, 20)
random_state = st.sidebar.number_input("Random State", 0, 100, 42)

# Prepare data: hold out a test split, then standardize the features.
X = df[iris.feature_names]
y = df["target"]
# Fix: stratify=y keeps the 50/50/50 class balance in both splits; the
# original unstratified split could leave small test sets class-skewed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size / 100, random_state=random_state, stratify=y
)
# Fit the scaler on training data only to avoid test-set leakage.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# ------------------------------------
# Model Configuration
# ------------------------------------
# Each branch builds `model` (unfitted) from the sidebar settings.
if ensemble_type == "Voting":
    st.sidebar.subheader("Voting Classifier Settings")
    voting_type = st.sidebar.radio("Voting Type", ["Hard", "Soft"])
    voting = "hard" if voting_type == "Hard" else "soft"
    # Three heterogeneous base learners; all support predict_proba, so both
    # hard and soft voting are valid choices.
    clf1 = LogisticRegression(random_state=random_state)
    clf2 = DecisionTreeClassifier(random_state=random_state)
    clf3 = KNeighborsClassifier()
    model = VotingClassifier(
        estimators=[("lr", clf1), ("dt", clf2), ("knn", clf3)],
        voting=voting,
    )
elif ensemble_type == "Bagging":
    st.sidebar.subheader("Bagging Settings")
    n_estimators = st.sidebar.slider("Number of Estimators", 1, 100, 10)
    max_samples = st.sidebar.slider("Max Samples per Estimator", 0.1, 1.0, 1.0)
    base_model = DecisionTreeClassifier(random_state=random_state)
    model = BaggingClassifier(
        estimator=base_model,
        n_estimators=n_estimators,
        max_samples=max_samples,
        random_state=random_state,
    )
elif ensemble_type == "Random Forest":
    st.sidebar.subheader("Random Forest Settings")
    n_estimators = st.sidebar.slider("Number of Trees", 1, 200, 100)
    # Fix: the original st.sidebar.slider("Max Depth", 1, 20, None) passed None
    # as the slider value, which st.slider rejects (StreamlitAPIException).
    # Keep "unlimited depth" reachable via a checkbox instead.
    unlimited_depth = st.sidebar.checkbox("No Max Depth (unlimited)", value=False)
    max_depth = None if unlimited_depth else st.sidebar.slider("Max Depth", 1, 20, 10)
    min_samples_split = st.sidebar.slider("Min Samples Split", 2, 10, 2)
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        random_state=random_state,
    )
# ------------------------------------
# Model Training and Evaluation
# ------------------------------------
st.subheader(f"๐ {ensemble_type} Classifier Performance")

# Train on the scaled features; predict on the held-out test split.
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Weighted averaging accounts for class-size differences in the test split.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Headline metrics, one per column.
col1, col2, col3, col4 = st.columns(4)
col1.metric("Accuracy", f"{accuracy:.2%}")
col2.metric("Precision", f"{precision:.2%}")
col3.metric("Recall", f"{recall:.2%}")
col4.metric("F1 Score", f"{f1:.2%}")

# Detailed evaluation: per-class report and confusion matrix.
tab_eval1, tab_eval2 = st.tabs(["๐ Classification Report", "๐ Confusion Matrix"])
with tab_eval1:
    st.text(classification_report(y_test, y_pred, target_names=iris.target_names))
with tab_eval2:
    cm = confusion_matrix(y_test, y_pred)
    # Draw on the explicit Axes rather than mixing fig/ax creation with
    # global pyplot calls (the original labeled via plt.* after subplots()).
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=iris.target_names,
                yticklabels=iris.target_names,
                ax=ax)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_title("Confusion Matrix")
    st.pyplot(fig)
    plt.close(fig)  # release the figure between reruns
# Feature importances are only exposed by tree ensembles (feature_importances_),
# so this section is shown for Random Forest only.
if ensemble_type == "Random Forest":
    st.subheader("๐ณ Feature Importance")
    feature_importance = model.feature_importances_
    importance_df = pd.DataFrame({
        "Feature": iris.feature_names,
        "Importance": feature_importance,
    }).sort_values("Importance", ascending=False)
    fig, ax = plt.subplots(figsize=(10, 5))
    # Draw on the explicit Axes (the original relied on implicit pyplot state).
    sns.barplot(data=importance_df, x="Importance", y="Feature", palette="viridis", ax=ax)
    ax.set_title("Random Forest Feature Importance")
    st.pyplot(fig)
    plt.close(fig)  # release the figure between reruns
# ------------------------------------
# Prediction Playground
# ------------------------------------
st.subheader("๐ฎ Make Your Own Prediction")
col1, col2, col3, col4 = st.columns(4)
with col1:
    sepal_length = st.number_input("Sepal length (cm)", min_value=4.0, max_value=8.0, value=5.1)
with col2:
    sepal_width = st.number_input("Sepal width (cm)", min_value=2.0, max_value=5.0, value=3.5)
with col3:
    petal_length = st.number_input("Petal length (cm)", min_value=1.0, max_value=7.0, value=1.4)
with col4:
    petal_width = st.number_input("Petal width (cm)", min_value=0.1, max_value=2.5, value=0.2)

if st.button("Predict Species"):
    # Fix: wrap the input in a DataFrame carrying the training feature names.
    # The scaler was fit on a named DataFrame, so transforming a bare list
    # triggers sklearn's "X does not have valid feature names" warning.
    input_df = pd.DataFrame(
        [[sepal_length, sepal_width, petal_length, petal_width]],
        columns=iris.feature_names,
    )
    input_scaled = scaler.transform(input_df)
    prediction = model.predict(input_scaled)[0]
    # Hard-voting classifiers expose no predict_proba; hasattr() guards that.
    proba = model.predict_proba(input_scaled)[0] if hasattr(model, "predict_proba") else None
    st.success(f"Predicted Species: **{iris.target_names[prediction]}**")
    if proba is not None:
        st.write("Prediction Probabilities:")
        proba_df = pd.DataFrame({
            "Species": iris.target_names,
            "Probability": proba,
        }).sort_values("Probability", ascending=False)
        st.dataframe(proba_df.style.format({"Probability": "{:.2%}"}), hide_index=True)
# ------------------------------------
# Final Summary
# ------------------------------------
# Closing blurb; the insight line depends on which ensemble method is active.
# Assembled with an f-string instead of str.format() -- rendered text unchanged.
insight = (
    "Feature importance shows petal measurements are most informative"
    if ensemble_type == "Random Forest"
    else "Combining multiple models leads to more robust predictions"
)
st.markdown(f"""
---
## ๐ Summary
- **Best Model**: {ensemble_type} with {accuracy:.2%} accuracy
- **Key Insights**: {insight}
> ๐ฏ Ensemble methods often outperform individual models by reducing variance and bias!
""")