# Scrape artifact removed ("Spaces: / Sleeping / Sleeping" — Hugging Face Spaces page chrome).
# Streamlit app: Ensemble Learning Playground on the Iris dataset.
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import streamlit as st
from sklearn.datasets import load_iris
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
# ------------------------------------
# Page setup and introduction
# ------------------------------------
# NOTE(review): the emoji in these UI strings appear mojibake'd in the source
# (e.g. "๐ง"); they are preserved verbatim here -- confirm intended glyphs
# against the original app before changing them.
st.set_page_config(page_title="๐ง Explore Ensemble Learning", layout="wide")
st.title("๐ง Ensemble Learning Playground")

# High-level explanation shown at the top of the page.
st.markdown("""
## ๐ค What is Ensemble Learning?
Ensemble Learning combines multiple machine learning models to improve overall performance and robustness.
> โจ "The wisdom of the crowd" โ combining multiple opinions leads to smarter predictions!
""")

# Collapsible primer on the three ensemble methods configurable below.
with st.expander("๐ Learn More About Ensemble Methods"):
    st.markdown("""
### ๐ง Key Ensemble Methods Explained:
- **Voting Classifier**: Combines predictions from multiple models (like Logistic Regression, Decision Tree, and KNN).
- *Hard voting*: Picks the class with the most votes.
- *Soft voting*: Averages predicted probabilities (requires models that support `predict_proba`).
- **Bagging (Bootstrap Aggregating)**: Trains the same model (e.g., Decision Tree) on different subsets of data and averages their outputs to reduce overfitting.
- **Random Forest**: A special type of bagging using multiple decision trees with added randomness for better performance.
""")
# ------------------------------------
# Load Dataset
# ------------------------------------
# Iris: 150 samples, 4 numeric features, 3 classes (50 samples each).
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df["target"] = iris.target
# Vectorized label -> name lookup (replaces the original per-row .apply lambda;
# same values, one numpy fancy-indexing operation instead of 150 Python calls).
df["species"] = iris.target_names[df["target"]]
# ------------------------------------
# Dataset Exploration
# ------------------------------------
tab1, tab2, tab3 = st.tabs(["๐ Dataset", "๐ Visualizations", "๐ Statistics"])

with tab1:
    st.subheader("๐ผ Iris Dataset Preview")
    st.dataframe(df.head(), use_container_width=True)
    st.markdown("""
**Dataset Info:**
- 150 samples (50 per class)
- 4 features (sepal length, sepal width, petal length, petal width)
- 3 target classes (setosa, versicolor, virginica)
""")

with tab2:
    st.subheader("Feature Relationships")
    col1, col2 = st.columns(2)
    with col1:
        features = st.multiselect("Select two features", iris.feature_names, default=iris.feature_names[:2])
        # Only draw when exactly two features are selected for the scatter axes.
        if len(features) == 2:
            # Fix: draw on an explicit Figure/Axes. Passing the pyplot module to
            # st.pyplot (the original st.pyplot(plt)) relies on deprecated global
            # figure state and is unsafe across Streamlit reruns.
            fig, ax = plt.subplots(figsize=(8, 5))
            sns.scatterplot(data=df, x=features[0], y=features[1], hue="species", palette="viridis", s=80, ax=ax)
            ax.set_title(f"{features[0]} vs {features[1]}")
            st.pyplot(fig)
            plt.close(fig)  # release the figure; replaces plt.clf()
    with col2:
        feature = st.selectbox("Select feature for distribution", iris.feature_names)
        fig, ax = plt.subplots(figsize=(8, 5))
        sns.boxplot(data=df, x="species", y=feature, palette="viridis", ax=ax)
        ax.set_title(f"Distribution of {feature} by species")
        st.pyplot(fig)
        plt.close(fig)

with tab3:
    st.subheader("Dataset Statistics")
    st.dataframe(df.describe(), use_container_width=True)
    # Pairwise Pearson correlation of the four numeric features.
    corr = df[iris.feature_names].corr()
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(corr, annot=True, cmap="coolwarm", center=0, ax=ax)
    ax.set_title("Feature Correlation Matrix")
    st.pyplot(fig)
    plt.close(fig)
# ------------------------------------
# Sidebar for Model Selection
# ------------------------------------
st.sidebar.header("๐ง Model Configuration")
ensemble_type = st.sidebar.selectbox(
    "Choose Ensemble Method",
    ["Voting", "Bagging", "Random Forest"],
    help="Select the ensemble learning technique to use",
)

# Common parameters shared by all three ensemble methods.
test_size = st.sidebar.slider("Test Set Size (%)", 10, 40, 20)
random_state = st.sidebar.number_input("Random State", 0, 100, 42)

# Prepare data: hold out a test split, then standardize the features.
X = df[iris.feature_names]
y = df["target"]
# Fix: stratify=y keeps the 50/50/50 class balance in both splits; the
# original unstratified split could leave small test sets class-skewed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size / 100, random_state=random_state, stratify=y
)
# Fit the scaler on training data only to avoid test-set leakage.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# ------------------------------------
# Model Configuration
# ------------------------------------
# Each branch builds `model` (unfitted) from the sidebar settings.
if ensemble_type == "Voting":
    st.sidebar.subheader("Voting Classifier Settings")
    voting_type = st.sidebar.radio("Voting Type", ["Hard", "Soft"])
    voting = "hard" if voting_type == "Hard" else "soft"
    # Three heterogeneous base learners; all support predict_proba, so both
    # hard and soft voting are valid choices.
    clf1 = LogisticRegression(random_state=random_state)
    clf2 = DecisionTreeClassifier(random_state=random_state)
    clf3 = KNeighborsClassifier()
    model = VotingClassifier(
        estimators=[("lr", clf1), ("dt", clf2), ("knn", clf3)],
        voting=voting,
    )
elif ensemble_type == "Bagging":
    st.sidebar.subheader("Bagging Settings")
    n_estimators = st.sidebar.slider("Number of Estimators", 1, 100, 10)
    max_samples = st.sidebar.slider("Max Samples per Estimator", 0.1, 1.0, 1.0)
    base_model = DecisionTreeClassifier(random_state=random_state)
    model = BaggingClassifier(
        estimator=base_model,
        n_estimators=n_estimators,
        max_samples=max_samples,
        random_state=random_state,
    )
elif ensemble_type == "Random Forest":
    st.sidebar.subheader("Random Forest Settings")
    n_estimators = st.sidebar.slider("Number of Trees", 1, 200, 100)
    # Fix: the original st.sidebar.slider("Max Depth", 1, 20, None) passed None
    # as the slider value, which st.slider rejects (StreamlitAPIException).
    # Keep "unlimited depth" reachable via a checkbox instead.
    unlimited_depth = st.sidebar.checkbox("No Max Depth (unlimited)", value=False)
    max_depth = None if unlimited_depth else st.sidebar.slider("Max Depth", 1, 20, 10)
    min_samples_split = st.sidebar.slider("Min Samples Split", 2, 10, 2)
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        random_state=random_state,
    )
# ------------------------------------
# Model Training and Evaluation
# ------------------------------------
st.subheader(f"๐ {ensemble_type} Classifier Performance")

# Train on the scaled features; predict on the held-out test split.
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Weighted averaging accounts for class-size differences in the test split.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Headline metrics, one per column.
col1, col2, col3, col4 = st.columns(4)
col1.metric("Accuracy", f"{accuracy:.2%}")
col2.metric("Precision", f"{precision:.2%}")
col3.metric("Recall", f"{recall:.2%}")
col4.metric("F1 Score", f"{f1:.2%}")

# Detailed evaluation: per-class report and confusion matrix.
tab_eval1, tab_eval2 = st.tabs(["๐ Classification Report", "๐ Confusion Matrix"])
with tab_eval1:
    st.text(classification_report(y_test, y_pred, target_names=iris.target_names))
with tab_eval2:
    cm = confusion_matrix(y_test, y_pred)
    # Draw on the explicit Axes rather than mixing fig/ax creation with
    # global pyplot calls (the original labeled via plt.* after subplots()).
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=iris.target_names,
                yticklabels=iris.target_names,
                ax=ax)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_title("Confusion Matrix")
    st.pyplot(fig)
    plt.close(fig)  # release the figure between reruns
# Feature importances are only exposed by tree ensembles (feature_importances_),
# so this section is shown for Random Forest only.
if ensemble_type == "Random Forest":
    st.subheader("๐ณ Feature Importance")
    feature_importance = model.feature_importances_
    importance_df = pd.DataFrame({
        "Feature": iris.feature_names,
        "Importance": feature_importance,
    }).sort_values("Importance", ascending=False)
    fig, ax = plt.subplots(figsize=(10, 5))
    # Draw on the explicit Axes (the original relied on implicit pyplot state).
    sns.barplot(data=importance_df, x="Importance", y="Feature", palette="viridis", ax=ax)
    ax.set_title("Random Forest Feature Importance")
    st.pyplot(fig)
    plt.close(fig)  # release the figure between reruns
# ------------------------------------
# Prediction Playground
# ------------------------------------
st.subheader("๐ฎ Make Your Own Prediction")
col1, col2, col3, col4 = st.columns(4)
with col1:
    sepal_length = st.number_input("Sepal length (cm)", min_value=4.0, max_value=8.0, value=5.1)
with col2:
    sepal_width = st.number_input("Sepal width (cm)", min_value=2.0, max_value=5.0, value=3.5)
with col3:
    petal_length = st.number_input("Petal length (cm)", min_value=1.0, max_value=7.0, value=1.4)
with col4:
    petal_width = st.number_input("Petal width (cm)", min_value=0.1, max_value=2.5, value=0.2)

if st.button("Predict Species"):
    # Fix: wrap the input in a DataFrame carrying the training feature names.
    # The scaler was fit on a named DataFrame, so transforming a bare list
    # triggers sklearn's "X does not have valid feature names" warning.
    input_df = pd.DataFrame(
        [[sepal_length, sepal_width, petal_length, petal_width]],
        columns=iris.feature_names,
    )
    input_scaled = scaler.transform(input_df)
    prediction = model.predict(input_scaled)[0]
    # Hard-voting classifiers expose no predict_proba; hasattr() guards that.
    proba = model.predict_proba(input_scaled)[0] if hasattr(model, "predict_proba") else None
    st.success(f"Predicted Species: **{iris.target_names[prediction]}**")
    if proba is not None:
        st.write("Prediction Probabilities:")
        proba_df = pd.DataFrame({
            "Species": iris.target_names,
            "Probability": proba,
        }).sort_values("Probability", ascending=False)
        st.dataframe(proba_df.style.format({"Probability": "{:.2%}"}), hide_index=True)
# ------------------------------------
# Final Summary
# ------------------------------------
# Closing blurb; the insight line depends on which ensemble method is active.
# Assembled with an f-string instead of str.format() -- rendered text unchanged.
insight = (
    "Feature importance shows petal measurements are most informative"
    if ensemble_type == "Random Forest"
    else "Combining multiple models leads to more robust predictions"
)
st.markdown(f"""
---
## ๐ Summary
- **Best Model**: {ensemble_type} with {accuracy:.2%} accuracy
- **Key Insights**: {insight}
> ๐ฏ Ensemble methods often outperform individual models by reducing variance and bias!
""")