# Source: user "satya11" — "8.Sample code.py" (commit 9cc268e, verified)
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.ensemble import VotingClassifier, BaggingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
# Set up Streamlit
# NOTE: set_page_config must run before any other st.* call.
st.set_page_config(page_title="๐Ÿง  Explore Ensemble Learning", layout="wide")
st.title("๐Ÿง  Ensemble Learning Playground")
# ------------------------------------
# Intro
# ------------------------------------
# Short conceptual introduction shown at the top of the page.
st.markdown("""
## ๐Ÿค What is Ensemble Learning?
Ensemble Learning combines multiple machine learning models to improve overall performance and robustness.
> โœจ "The wisdom of the crowd" โ€” combining multiple opinions leads to smarter predictions!
""")
# Collapsible primer on the three ensemble methods this app demonstrates.
with st.expander("๐Ÿ“š Learn More About Ensemble Methods"):
    st.markdown("""
### ๐Ÿง  Key Ensemble Methods Explained:
- **Voting Classifier**: Combines predictions from multiple models (like Logistic Regression, Decision Tree, and KNN).
- *Hard voting*: Picks the class with the most votes.
- *Soft voting*: Averages predicted probabilities (requires models that support `predict_proba`).
- **Bagging (Bootstrap Aggregating)**: Trains the same model (e.g., Decision Tree) on different subsets of data and averages their outputs to reduce overfitting.
- **Random Forest**: A special type of bagging using multiple decision trees with added randomness for better performance.
""")
# ------------------------------------
# Load Dataset
# ------------------------------------
# Build a tidy frame: the four numeric iris features plus the integer
# class label and its human-readable species name.
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df["target"] = iris.target
label_by_index = dict(enumerate(iris.target_names))
df["species"] = df["target"].map(label_by_index)
# ------------------------------------
# Dataset Exploration
# ------------------------------------
# Three read-only tabs: raw preview, interactive plots, summary stats.
tab1, tab2, tab3 = st.tabs(["๐Ÿ“‹ Dataset", "๐Ÿ“Š Visualizations", "๐Ÿ“ˆ Statistics"])
with tab1:
    st.subheader("๐ŸŒผ Iris Dataset Preview")
    st.dataframe(df.head(), use_container_width=True)
    st.markdown("""
**Dataset Info:**
- 150 samples (50 per class)
- 4 features (sepal length, sepal width, petal length, petal width)
- 3 target classes (setosa, versicolor, virginica)
""")
with tab2:
    st.subheader("Feature Relationships")
    col1, col2 = st.columns(2)
    with col1:
        # Scatter plot is drawn only once the user has exactly two features selected.
        features = st.multiselect("Select two features", iris.feature_names, default=iris.feature_names[:2])
        if len(features) == 2:
            plt.figure(figsize=(8, 5))
            sns.scatterplot(data=df, x=features[0], y=features[1], hue="species", palette="viridis", s=80)
            plt.title(f"{features[0]} vs {features[1]}")
            st.pyplot(plt)
            plt.clf()  # clear the global pyplot figure so later plots start fresh
    with col2:
        # Per-species box plot of a single selected feature.
        feature = st.selectbox("Select feature for distribution", iris.feature_names)
        plt.figure(figsize=(8, 5))
        sns.boxplot(data=df, x="species", y=feature, palette="viridis")
        plt.title(f"Distribution of {feature} by species")
        st.pyplot(plt)
        plt.clf()
with tab3:
    st.subheader("Dataset Statistics")
    st.dataframe(df.describe(), use_container_width=True)
    # Pairwise Pearson correlations between the four numeric features.
    corr = df[iris.feature_names].corr()
    plt.figure(figsize=(8, 6))
    sns.heatmap(corr, annot=True, cmap="coolwarm", center=0)
    plt.title("Feature Correlation Matrix")
    st.pyplot(plt)
    plt.clf()
# ------------------------------------
# Sidebar for Model Selection
# ------------------------------------
st.sidebar.header("๐Ÿ”ง Model Configuration")
# Which ensemble technique the rest of the app trains and evaluates.
ensemble_type = st.sidebar.selectbox("Choose Ensemble Method",
                                     ["Voting", "Bagging", "Random Forest"],
                                     help="Select the ensemble learning technique to use")
# Common parameters
test_size = st.sidebar.slider("Test Set Size (%)", 10, 40, 20)  # percent of rows held out for testing
random_state = st.sidebar.number_input("Random State", 0, 100, 42)  # seed shared by the split and the models
# Prepare Data
# Separate features from labels, hold out a random test split, then
# standardize — fitting the scaler on training data only (no leakage).
X = df.loc[:, iris.feature_names]
y = df["target"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size / 100,
    random_state=random_state,
)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# ------------------------------------
# Model Configuration
# ------------------------------------
# Each branch renders its own sidebar controls and binds `model`, which
# the training section below fits on the scaled training split.
if ensemble_type == "Voting":
    st.sidebar.subheader("Voting Classifier Settings")
    voting_type = st.sidebar.radio("Voting Type", ["Hard", "Soft"])
    voting = "hard" if voting_type == "Hard" else "soft"
    # Three heterogeneous base learners: soft voting averages their
    # predicted probabilities, hard voting takes the majority class.
    clf1 = LogisticRegression(random_state=random_state)
    clf2 = DecisionTreeClassifier(random_state=random_state)
    clf3 = KNeighborsClassifier()
    model = VotingClassifier(estimators=[
        ('lr', clf1),
        ('dt', clf2),
        ('knn', clf3)
    ], voting=voting)
elif ensemble_type == "Bagging":
    st.sidebar.subheader("Bagging Settings")
    n_estimators = st.sidebar.slider("Number of Estimators", 1, 100, 10)
    # Fraction of the training set bootstrapped for each estimator.
    max_samples = st.sidebar.slider("Max Samples per Estimator", 0.1, 1.0, 1.0)
    base_model = DecisionTreeClassifier(random_state=random_state)
    model = BaggingClassifier(
        estimator=base_model,
        n_estimators=n_estimators,
        max_samples=max_samples,
        random_state=random_state
    )
elif ensemble_type == "Random Forest":
    st.sidebar.subheader("Random Forest Settings")
    n_estimators = st.sidebar.slider("Number of Trees", 1, 200, 100)
    # BUG FIX: the original `st.sidebar.slider("Max Depth", 1, 20, None)`
    # passed None as the slider's default value, which st.slider cannot
    # represent — it either raises or silently falls back to min_value=1
    # (depth-1 stumps) — and "no depth limit" (None, the RandomForest
    # default the author intended) was impossible to select. A checkbox
    # now toggles between unlimited depth and a bounded slider.
    limit_depth = st.sidebar.checkbox("Limit Max Depth", value=False)
    max_depth = st.sidebar.slider("Max Depth", 1, 20, 10) if limit_depth else None
    min_samples_split = st.sidebar.slider("Min Samples Split", 2, 10, 2)
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        random_state=random_state
    )
# ------------------------------------
# Model Training and Evaluation
# ------------------------------------
st.subheader(f"๐Ÿš€ {ensemble_type} Classifier Performance")
# Fit the configured ensemble on the standardized training split and
# score it on the held-out test split.
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
# 'weighted' averaging aggregates per-class scores by class support.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
# Render one metric card per column.
metric_pairs = [
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
]
for card, (label, score) in zip(st.columns(4), metric_pairs):
    card.metric(label, f"{score:.2%}")
# Detailed evaluation
report_tab, matrix_tab = st.tabs(["๐Ÿ“ Classification Report", "๐Ÿ“Š Confusion Matrix"])
with report_tab:
    # Per-class precision/recall/F1 as preformatted text.
    st.text(classification_report(y_test, y_pred, target_names=iris.target_names))
with matrix_tab:
    conf_mat = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        conf_mat,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=iris.target_names,
        yticklabels=iris.target_names,
        ax=ax,
    )
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_title("Confusion Matrix")
    st.pyplot(fig)
# Feature importance for Random Forest
if ensemble_type == "Random Forest":
    st.subheader("๐ŸŒณ Feature Importance")
    # Impurity-based importances from the fitted forest, largest first.
    importance_df = pd.DataFrame(
        {
            "Feature": iris.feature_names,
            "Importance": model.feature_importances_,
        }
    ).sort_values("Importance", ascending=False)
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.barplot(data=importance_df, x="Importance", y="Feature", palette="viridis", ax=ax)
    ax.set_title("Random Forest Feature Importance")
    st.pyplot(fig)
# ------------------------------------
# Prediction Playground
# ------------------------------------
st.subheader("๐Ÿ”ฎ Make Your Own Prediction")
# Four numeric inputs, one per iris feature, bounded to plausible ranges.
col1, col2, col3, col4 = st.columns(4)
with col1:
    sepal_length = st.number_input("Sepal length (cm)", min_value=4.0, max_value=8.0, value=5.1)
with col2:
    sepal_width = st.number_input("Sepal width (cm)", min_value=2.0, max_value=5.0, value=3.5)
with col3:
    petal_length = st.number_input("Petal length (cm)", min_value=1.0, max_value=7.0, value=1.4)
with col4:
    petal_width = st.number_input("Petal width (cm)", min_value=0.1, max_value=2.5, value=0.2)
if st.button("Predict Species"):
    # FIX: build a one-row DataFrame with the training column names instead
    # of a bare nested list — the scaler was fit on a DataFrame, so passing
    # a plain list triggers sklearn's "X does not have valid feature names"
    # warning and relies on implicit column ordering.
    input_df = pd.DataFrame(
        [[sepal_length, sepal_width, petal_length, petal_width]],
        columns=iris.feature_names,
    )
    input_scaled = scaler.transform(input_df)
    prediction = model.predict(input_scaled)[0]
    # Hard-voting ensembles expose no predict_proba; guard before calling.
    proba = model.predict_proba(input_scaled)[0] if hasattr(model, "predict_proba") else None
    st.success(f"Predicted Species: **{iris.target_names[prediction]}**")
    if proba is not None:
        st.write("Prediction Probabilities:")
        proba_df = pd.DataFrame({
            "Species": iris.target_names,
            "Probability": proba
        }).sort_values("Probability", ascending=False)
        st.dataframe(proba_df.style.format({"Probability": "{:.2%}"}), hide_index=True)
# ------------------------------------
# Final Summary
# ------------------------------------
# Closing recap of the trained model; rendered with an f-string rather
# than str.format. The insight line depends on the chosen ensemble.
summary_insight = (
    "Feature importance shows petal measurements are most informative"
    if ensemble_type == "Random Forest"
    else "Combining multiple models leads to more robust predictions"
)
st.markdown(f"""
---
## ๐Ÿ“Œ Summary
- **Best Model**: {ensemble_type} with {accuracy:.2%} accuracy
- **Key Insights**: {summary_insight}
> ๐ŸŽฏ Ensemble methods often outperform individual models by reducing variance and bias!
""")