satya11 committed on
Commit
9cc268e
·
verified ·
1 Parent(s): 18f4719

Create 8.Sample code.py

Browse files
Files changed (1) hide show
  1. pages/8.Sample code.py +259 -0
pages/8.Sample code.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import matplotlib.pyplot as plt
4
+ import seaborn as sns
5
+ from sklearn.datasets import load_iris
6
+ from sklearn.ensemble import VotingClassifier, BaggingClassifier, RandomForestClassifier
7
+ from sklearn.linear_model import LogisticRegression
8
+ from sklearn.tree import DecisionTreeClassifier
9
+ from sklearn.neighbors import KNeighborsClassifier
10
+ from sklearn.model_selection import train_test_split
11
+ from sklearn.preprocessing import StandardScaler
12
+ from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
13
+
14
# ------------------------------------
# Page setup & introduction
# ------------------------------------
# Page chrome must be configured before any other Streamlit call on the page.
st.set_page_config(page_title="๐Ÿง  Explore Ensemble Learning", layout="wide")
st.title("๐Ÿง  Ensemble Learning Playground")

# One-paragraph motivation shown at the top of the page.
st.markdown("""
## ๐Ÿค What is Ensemble Learning?
Ensemble Learning combines multiple machine learning models to improve overall performance and robustness.
> โœจ "The wisdom of the crowd" โ€” combining multiple opinions leads to smarter predictions!
""")

# Primer text kept in a constant so the layout code below stays short.
_METHODS_PRIMER = """
### ๐Ÿง  Key Ensemble Methods Explained:
- **Voting Classifier**: Combines predictions from multiple models (like Logistic Regression, Decision Tree, and KNN).
- *Hard voting*: Picks the class with the most votes.
- *Soft voting*: Averages predicted probabilities (requires models that support `predict_proba`).
- **Bagging (Bootstrap Aggregating)**: Trains the same model (e.g., Decision Tree) on different subsets of data and averages their outputs to reduce overfitting.
- **Random Forest**: A special type of bagging using multiple decision trees with added randomness for better performance.
"""

# Collapsible primer on the three ensemble techniques offered below.
with st.expander("๐Ÿ“š Learn More About Ensemble Methods"):
    st.markdown(_METHODS_PRIMER)
36
+
37
# ------------------------------------
# Load Dataset
# ------------------------------------
# Build a DataFrame with the four numeric features plus the integer label
# ("target") and its human-readable name ("species").
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df["target"] = iris.target
df["species"] = [iris.target_names[label] for label in iris.target]
44
+
45
# ------------------------------------
# Dataset Exploration
# ------------------------------------
tab1, tab2, tab3 = st.tabs(["๐Ÿ“‹ Dataset", "๐Ÿ“Š Visualizations", "๐Ÿ“ˆ Statistics"])

with tab1:
    st.subheader("๐ŸŒผ Iris Dataset Preview")
    st.dataframe(df.head(), use_container_width=True)

    st.markdown("""
**Dataset Info:**
- 150 samples (50 per class)
- 4 features (sepal length, sepal width, petal length, petal width)
- 3 target classes (setosa, versicolor, virginica)
""")

with tab2:
    st.subheader("Feature Relationships")
    col1, col2 = st.columns(2)

    with col1:
        features = st.multiselect("Select two features", iris.feature_names, default=iris.feature_names[:2])
        # Only draw when exactly two features are picked; otherwise axes are ambiguous.
        if len(features) == 2:
            # FIX: use an explicit Figure instead of the global pyplot state.
            # st.pyplot(plt) (passing the module) is deprecated and relies on
            # matplotlib's shared "current figure", which leaks across reruns.
            fig, ax = plt.subplots(figsize=(8, 5))
            sns.scatterplot(data=df, x=features[0], y=features[1], hue="species", palette="viridis", s=80, ax=ax)
            ax.set_title(f"{features[0]} vs {features[1]}")
            st.pyplot(fig)
            plt.close(fig)  # release the figure so memory doesn't grow on each rerun

    with col2:
        feature = st.selectbox("Select feature for distribution", iris.feature_names)
        fig, ax = plt.subplots(figsize=(8, 5))
        sns.boxplot(data=df, x="species", y=feature, palette="viridis", ax=ax)
        ax.set_title(f"Distribution of {feature} by species")
        st.pyplot(fig)
        plt.close(fig)

with tab3:
    st.subheader("Dataset Statistics")
    st.dataframe(df.describe(), use_container_width=True)

    # Correlation heatmap over the four numeric feature columns only
    # (excludes the integer "target" column, which would skew the picture).
    corr = df[iris.feature_names].corr()
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(corr, annot=True, cmap="coolwarm", center=0, ax=ax)
    ax.set_title("Feature Correlation Matrix")
    st.pyplot(fig)
    plt.close(fig)
92
+
93
# ------------------------------------
# Sidebar for Model Selection
# ------------------------------------
st.sidebar.header("๐Ÿ”ง Model Configuration")
ensemble_type = st.sidebar.selectbox(
    "Choose Ensemble Method",
    ["Voting", "Bagging", "Random Forest"],
    help="Select the ensemble learning technique to use",
)

# Parameters shared by every ensemble method.
test_size = st.sidebar.slider("Test Set Size (%)", 10, 40, 20)
random_state = st.sidebar.number_input("Random State", 0, 100, 42)

# Split into train/test (slider gives a percentage, sklearn wants a fraction),
# then standardize: fit the scaler on the training split only and reuse it on
# the test split to avoid leaking test statistics into training.
X = df[iris.feature_names]
y = df["target"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size / 100, random_state=random_state
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
113
+
114
# ------------------------------------
# Model Configuration
# ------------------------------------
# Build the selected ensemble model from the sidebar settings.
# Every branch assigns `model`, which is trained and evaluated below.
if ensemble_type == "Voting":
    st.sidebar.subheader("Voting Classifier Settings")
    voting_type = st.sidebar.radio("Voting Type", ["Hard", "Soft"])
    voting = "hard" if voting_type == "Hard" else "soft"

    # Three diverse base learners; all support predict_proba, so both
    # hard and soft voting work.
    clf1 = LogisticRegression(random_state=random_state)
    clf2 = DecisionTreeClassifier(random_state=random_state)
    clf3 = KNeighborsClassifier()

    model = VotingClassifier(estimators=[
        ('lr', clf1),
        ('dt', clf2),
        ('knn', clf3)
    ], voting=voting)

elif ensemble_type == "Bagging":
    st.sidebar.subheader("Bagging Settings")
    n_estimators = st.sidebar.slider("Number of Estimators", 1, 100, 10)
    # Fraction of the training set drawn (with replacement) per estimator.
    max_samples = st.sidebar.slider("Max Samples per Estimator", 0.1, 1.0, 1.0)

    base_model = DecisionTreeClassifier(random_state=random_state)
    model = BaggingClassifier(
        estimator=base_model,
        n_estimators=n_estimators,
        max_samples=max_samples,
        random_state=random_state
    )

elif ensemble_type == "Random Forest":
    st.sidebar.subheader("Random Forest Settings")
    n_estimators = st.sidebar.slider("Number of Trees", 1, 200, 100)
    # BUG FIX: the original passed None as the slider's default value
    # (st.sidebar.slider("Max Depth", 1, 20, None)), but st.slider does not
    # accept None to mean "unlimited". Gate the slider behind a checkbox so
    # max_depth defaults to None (sklearn's "grow fully") unless the user
    # explicitly opts in to limiting the depth.
    limit_depth = st.sidebar.checkbox("Limit Max Depth", value=False)
    max_depth = st.sidebar.slider("Max Depth", 1, 20, 10) if limit_depth else None
    min_samples_split = st.sidebar.slider("Min Samples Split", 2, 10, 2)

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        random_state=random_state
    )
158
+
159
# ------------------------------------
# Model Training and Evaluation
# ------------------------------------
st.subheader(f"๐Ÿš€ {ensemble_type} Classifier Performance")

# Fit on the standardized training split, then predict the held-out test split.
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Weighted averaging accounts for any class imbalance in the test split.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Headline metrics, one per column.
metric_cols = st.columns(4)
headline = [
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
]
for metric_col, (label, value) in zip(metric_cols, headline):
    metric_col.metric(label, f"{value:.2%}")

# Detailed per-class evaluation.
tab_eval1, tab_eval2 = st.tabs(["๐Ÿ“ Classification Report", "๐Ÿ“Š Confusion Matrix"])

with tab_eval1:
    st.text(classification_report(y_test, y_pred, target_names=iris.target_names))

with tab_eval2:
    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=iris.target_names,
                yticklabels=iris.target_names, ax=ax)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_title("Confusion Matrix")
    st.pyplot(fig)
197
+
198
# Feature importance is only defined for the Random Forest model.
if ensemble_type == "Random Forest":
    st.subheader("๐ŸŒณ Feature Importance")
    # Rank features by the forest's impurity-based importances.
    importance_df = pd.DataFrame({
        "Feature": iris.feature_names,
        "Importance": model.feature_importances_,
    }).sort_values("Importance", ascending=False)

    fig, ax = plt.subplots(figsize=(10, 5))
    sns.barplot(data=importance_df, x="Importance", y="Feature", palette="viridis", ax=ax)
    ax.set_title("Random Forest Feature Importance")
    st.pyplot(fig)
211
+
212
# ------------------------------------
# Prediction Playground
# ------------------------------------
st.subheader("๐Ÿ”ฎ Make Your Own Prediction")

col1, col2, col3, col4 = st.columns(4)
with col1:
    sepal_length = st.number_input("Sepal length (cm)", min_value=4.0, max_value=8.0, value=5.1)
with col2:
    sepal_width = st.number_input("Sepal width (cm)", min_value=2.0, max_value=5.0, value=3.5)
with col3:
    petal_length = st.number_input("Petal length (cm)", min_value=1.0, max_value=7.0, value=1.4)
with col4:
    petal_width = st.number_input("Petal width (cm)", min_value=0.1, max_value=2.5, value=0.2)

if st.button("Predict Species"):
    # BUG FIX: the scaler was fitted on a DataFrame with named columns, so
    # transform a DataFrame with the same column names (in the same order)
    # rather than a bare nested list. This avoids sklearn's
    # "X does not have valid feature names" warning and makes the feature
    # order explicit instead of implicit.
    input_data = pd.DataFrame(
        [[sepal_length, sepal_width, petal_length, petal_width]],
        columns=iris.feature_names,
    )
    input_scaled = scaler.transform(input_data)
    prediction = model.predict(input_scaled)[0]
    # Hard-voting classifiers don't expose predict_proba, hence the hasattr guard.
    proba = model.predict_proba(input_scaled)[0] if hasattr(model, "predict_proba") else None

    st.success(f"Predicted Species: **{iris.target_names[prediction]}**")

    if proba is not None:
        st.write("Prediction Probabilities:")
        proba_df = pd.DataFrame({
            "Species": iris.target_names,
            "Probability": proba
        }).sort_values("Probability", ascending=False)
        st.dataframe(proba_df.style.format({"Probability": "{:.2%}"}), hide_index=True)
242
+
243
# ------------------------------------
# Final Summary
# ------------------------------------
# Pick the insight line based on which model was trained above.
insight = (
    "Feature importance shows petal measurements are most informative"
    if ensemble_type == "Random Forest"
    else "Combining multiple models leads to more robust predictions"
)
st.markdown(f"""
---
## ๐Ÿ“Œ Summary
- **Best Model**: {ensemble_type} with {accuracy:.2%} accuracy
- **Key Insights**: {insight}

> ๐ŸŽฏ Ensemble methods often outperform individual models by reducing variance and bias!
""")