Spaces:
Sleeping
Sleeping
Update pages/4_Model Creation and Evaluation.py
Browse files
pages/4_Model Creation and Evaluation.py
CHANGED
|
@@ -71,96 +71,171 @@ st.write("Model training and selection is a crucial phase in machine learning. A
|
|
| 71 |
st.subheader("Data Splitting")
|
| 72 |
st.write("The dataset is divided into training and testing sets. The training set is used to train the model, while the testing set is used to evaluate its performance on unseen data.")
|
| 73 |
|
| 74 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
if 'X_res' in st.session_state and 'y_res' in st.session_state:
|
| 76 |
-
# Retrieve data from session state
|
| 77 |
X_res = st.session_state['X_res']
|
| 78 |
y_res = st.session_state['y_res']
|
| 79 |
-
|
| 80 |
-
# Perform train-test split
|
| 81 |
x_train, x_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)
|
| 82 |
|
| 83 |
-
|
| 84 |
-
st.
|
| 85 |
-
st.
|
| 86 |
-
st.
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
st.write("Train-test split completed!")
|
| 90 |
-
|
| 91 |
-
# Debugging: Check data shapes
|
| 92 |
-
st.write(f"x_train shape: {x_train.shape}, x_test shape: {x_test.shape}")
|
| 93 |
-
st.write(f"y_train shape: {y_train.shape}, y_test shape: {y_test.shape}")
|
| 94 |
-
|
| 95 |
-
# Data Preprocessing
|
| 96 |
-
st.subheader("Data Preprocessing")
|
| 97 |
-
st.write("""
|
| 98 |
-
Before training the model, the data may need to be preprocessed. This includes scaling features using techniques like:
|
| 99 |
-
- **StandardScaler**: Standardizes features by removing the mean and scaling to unit variance.
|
| 100 |
-
- **MinMaxScaler**: Scales features to a specific range, typically between 0 and 1.
|
| 101 |
-
""")
|
| 102 |
-
|
| 103 |
-
# Data Scaling
|
| 104 |
-
scaler = StandardScaler()
|
| 105 |
-
x_train_std = scaler.fit_transform(x_train)
|
| 106 |
-
x_test_std = scaler.transform(x_test)
|
| 107 |
-
|
| 108 |
-
# Hyperparameter Tuning with Optuna
|
| 109 |
-
st.subheader("Hyperparameter Tuning with Optuna")
|
| 110 |
-
st.write("""
|
| 111 |
-
Optuna is an automatic hyperparameter optimization framework that allows us to efficiently search for the best hyperparameters for our models. It uses a technique called Bayesian Optimization to find the optimal set of hyperparameters that maximize the model's performance.
|
| 112 |
-
""")
|
| 113 |
-
|
| 114 |
-
# Optuna Objective Function
|
| 115 |
-
def objective(trial):
|
| 116 |
-
# Model Selection
|
| 117 |
-
algo = trial.suggest_categorical("choice", ["KNN", "Logistic"])
|
| 118 |
-
if algo == "KNN":
|
| 119 |
-
n = trial.suggest_int("n_neighbors", 1, 50)
|
| 120 |
-
p = trial.suggest_int("distance", 1, 2)
|
| 121 |
-
model = KNeighborsClassifier(n_neighbors=n, p=p)
|
| 122 |
-
else:
|
| 123 |
-
solver, penalty = trial.suggest_categorical("choices", [("lbfgs", "l2"), ("newton-cg", "l2"), ("sag", "l2"), ("saga", "l1"), ("saga", "l2"), ("saga", "elasticnet")])
|
| 124 |
-
C = trial.suggest_uniform("lambda", 0.01, 1000)
|
| 125 |
-
if penalty == "elasticnet":
|
| 126 |
-
model = LogisticRegression(C=C, solver=solver, penalty=penalty, multi_class="multinomial", l1_ratio=0.3)
|
| 127 |
-
else:
|
| 128 |
-
model = LogisticRegression(C=C, solver=solver, penalty=penalty, multi_class="multinomial")
|
| 129 |
-
|
| 130 |
-
return cross_val_score(model, x_train_std, y_train, cv=5, scoring="neg_log_loss").mean()
|
| 131 |
-
|
| 132 |
-
# Optuna Study
|
| 133 |
-
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler())
|
| 134 |
-
study.optimize(objective, n_trials=200)
|
| 135 |
-
|
| 136 |
-
# Display Results
|
| 137 |
-
st.write("### Optuna Results")
|
| 138 |
-
st.write("Best Parameters:", study.best_params)
|
| 139 |
-
st.write(study.trials_dataframe())
|
| 140 |
-
|
| 141 |
-
# Best Parameters
|
| 142 |
-
solver = study.best_params.get('solver', 'lbfgs')
|
| 143 |
-
penalty = study.best_params.get('penalty', 'l2')
|
| 144 |
-
C = study.best_params.get('lambda', 1.0)
|
| 145 |
-
|
| 146 |
-
# Model Training with Best Parameters
|
| 147 |
-
model = LogisticRegression(C=C, solver=solver, penalty=penalty, multi_class="multinomial", max_iter=500)
|
| 148 |
-
model.fit(x_train_std, y_train)
|
| 149 |
-
|
| 150 |
-
# Evaluation Metrics
|
| 151 |
-
y_pred_probs = model.predict_proba(x_test_std)
|
| 152 |
-
loss = log_loss(y_test, y_pred_probs)
|
| 153 |
-
y_pred = np.argmax(y_pred_probs, axis=1)
|
| 154 |
|
| 155 |
-
|
| 156 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
|
| 163 |
-
else:
|
| 164 |
-
st.error("Training and testing data are not available. Please run the previous steps first.")
|
| 165 |
-
|
| 166 |
|
|
|
|
| 71 |
st.subheader("Data Splitting")
|
| 72 |
st.write("The dataset is divided into training and testing sets. The training set is used to train the model, while the testing set is used to evaluate its performance on unseen data.")
|
| 73 |
|
| 74 |
+
# Page Title
|
| 75 |
+
st.markdown("<h1 style='text-align:center; color:purple;'>Model Creation and Evaluation</h1>", unsafe_allow_html=True)
|
| 76 |
+
|
| 77 |
+
# Code and Output 1: Data Splitting
|
| 78 |
+
st.subheader("Step 1: Data Splitting")
|
| 79 |
+
|
| 80 |
+
# Code for Data Splitting
|
| 81 |
+
code_1 = """
|
| 82 |
+
# Data Splitting
|
| 83 |
+
x_train, x_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)
|
| 84 |
+
"""
|
| 85 |
+
st.code(code_1, language='python')
|
| 86 |
+
|
| 87 |
+
# Output for Data Splitting
|
| 88 |
if 'X_res' in st.session_state and 'y_res' in st.session_state:
|
|
|
|
| 89 |
X_res = st.session_state['X_res']
|
| 90 |
y_res = st.session_state['y_res']
|
|
|
|
|
|
|
| 91 |
x_train, x_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)
|
| 92 |
|
| 93 |
+
st.write(f"x_train shape: {x_train.shape}")
|
| 94 |
+
st.write(f"x_test shape: {x_test.shape}")
|
| 95 |
+
st.write(f"y_train shape: {y_train.shape}")
|
| 96 |
+
st.write(f"y_test shape: {y_test.shape}")
|
| 97 |
+
else:
|
| 98 |
+
st.error("Training and testing data are not available. Please run the previous steps first.")
# Halt this script run here: every later step (scaling, Optuna, training,
# evaluation) uses x_train/x_test/y_train/y_test, which are only assigned
# when the session-state branch above performed the split.
st.stop()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
|
| 100 |
+
# Code and Output 2: Data Scaling
|
| 101 |
+
st.subheader("Step 2: Data Scaling")
|
| 102 |
+
|
| 103 |
+
# Code for Data Scaling
|
| 104 |
+
code_2 = """
|
| 105 |
+
# Data Scaling
|
| 106 |
+
scaler = StandardScaler()
|
| 107 |
+
x_train_std = scaler.fit_transform(x_train)
|
| 108 |
+
x_test_std = scaler.transform(x_test)
|
| 109 |
+
"""
|
| 110 |
+
st.code(code_2, language='python')
|
| 111 |
+
|
| 112 |
+
# Output for Data Scaling
|
| 113 |
+
scaler = StandardScaler()
|
| 114 |
+
x_train_std = scaler.fit_transform(x_train)
|
| 115 |
+
x_test_std = scaler.transform(x_test)
|
| 116 |
+
|
| 117 |
+
st.write(f"Scaled x_train_std shape: {x_train_std.shape}")
|
| 118 |
+
st.write(f"Scaled x_test_std shape: {x_test_std.shape}")
|
| 119 |
+
|
| 120 |
+
# Code and Output 3: Optuna Optimization
|
| 121 |
+
st.subheader("Step 3: Hyperparameter Optimization with Optuna")
|
| 122 |
+
|
| 123 |
+
# Code for Optuna
|
| 124 |
+
code_3 = """
|
| 125 |
+
# Optuna Objective Function
|
| 126 |
+
def objective(trial):
|
| 127 |
+
algo = trial.suggest_categorical("choice", ["KNN", "Logistic"])
|
| 128 |
+
if algo == "KNN":
|
| 129 |
+
n = trial.suggest_int("n_neighbors", 1, 50)
|
| 130 |
+
p = trial.suggest_int("distance", 1, 2)
|
| 131 |
+
model = KNeighborsClassifier(n_neighbors=n, p=p)
|
| 132 |
+
else:
|
| 133 |
+
solver, penalty = trial.suggest_categorical("choices", [("lbfgs", "l2"), ("newton-cg", "l2")])
|
| 134 |
+
C = trial.suggest_uniform("lambda", 0.01, 1000)
|
| 135 |
+
model = LogisticRegression(C=C, solver=solver, penalty=penalty)
|
| 136 |
+
return cross_val_score(model, x_train_std, y_train, cv=5, scoring="neg_log_loss").mean()
|
| 137 |
+
|
| 138 |
+
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler())
|
| 139 |
+
study.optimize(objective, n_trials=200)
|
| 140 |
+
"""
|
| 141 |
+
st.code(code_3, language='python')
|
| 142 |
+
|
| 143 |
+
# Output for Optuna Optimization
|
| 144 |
+
def objective(trial):
    """Return the mean 5-fold negative log-loss for the model sampled by ``trial``.

    Mirrors the snippet displayed to the user in ``code_3`` so that the page
    executes the same search it shows. The function must exist as real code:
    the copy inside ``code_3`` is only a string passed to ``st.code``.
    """
    algo = trial.suggest_categorical("choice", ["KNN", "Logistic"])
    if algo == "KNN":
        n = trial.suggest_int("n_neighbors", 1, 50)
        p = trial.suggest_int("distance", 1, 2)
        model = KNeighborsClassifier(n_neighbors=n, p=p)
    else:
        solver, penalty = trial.suggest_categorical("choices", [("lbfgs", "l2"), ("newton-cg", "l2")])
        C = trial.suggest_uniform("lambda", 0.01, 1000)
        model = LogisticRegression(C=C, solver=solver, penalty=penalty)
    # neg_log_loss is "higher is better", matching direction="maximize" below.
    return cross_val_score(model, x_train_std, y_train, cv=5, scoring="neg_log_loss").mean()


study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler())
|
| 145 |
+
study.optimize(objective, n_trials=200)
|
| 146 |
+
|
| 147 |
+
st.write("Best Parameters found by Optuna:", study.best_params)
|
| 148 |
+
st.write("All Trials Dataframe:")
|
| 149 |
+
st.write(study.trials_dataframe())
|
| 150 |
+
|
| 151 |
+
# Code and Output 4: Model Training with Best Parameters
|
| 152 |
+
st.subheader("Step 4: Model Training with Best Parameters")
|
| 153 |
+
|
| 154 |
+
# Code for Model Training
|
| 155 |
+
code_4 = """
|
| 156 |
+
# Model Training with Best Parameters
|
| 157 |
+
solver = study.best_params.get('solver', 'lbfgs')
|
| 158 |
+
penalty = study.best_params.get('penalty', 'l2')
|
| 159 |
+
C = study.best_params.get('lambda', 1.0)
|
| 160 |
+
model = LogisticRegression(C=C, solver=solver, penalty=penalty, max_iter=500)
|
| 161 |
+
model.fit(x_train_std, y_train)
|
| 162 |
+
"""
|
| 163 |
+
st.code(code_4, language='python')
|
| 164 |
+
|
| 165 |
+
# Output for Model Training
|
| 166 |
+
# The objective shown in code_3 suggests the logistic-regression settings as a
# (solver, penalty) tuple under the "choices" key; no "solver" key is ever
# suggested, so looking up 'solver' always fell back to the default.
# The fallback tuple covers the case where the best trial was a KNN trial.
solver, _ = study.best_params.get('choices', ('lbfgs', 'l2'))
|
| 167 |
+
penalty = study.best_params.get('penalty', 'l2')
|
| 168 |
+
C = study.best_params.get('lambda', 1.0)
|
| 169 |
+
model = LogisticRegression(C=C, solver=solver, penalty=penalty, max_iter=500)
|
| 170 |
+
model.fit(x_train_std, y_train)
|
| 171 |
+
|
| 172 |
+
st.write("Model has been trained successfully!")
|
| 173 |
+
|
| 174 |
+
# Code and Output 5: Model Evaluation
|
| 175 |
+
st.subheader("Step 5: Model Evaluation")
|
| 176 |
+
|
| 177 |
+
# Code for Model Evaluation
|
| 178 |
+
code_5 = """
|
| 179 |
+
# Model Evaluation
|
| 180 |
+
y_pred_probs = model.predict_proba(x_test_std)
|
| 181 |
+
loss = log_loss(y_test, y_pred_probs)
|
| 182 |
+
y_pred = np.argmax(y_pred_probs, axis=1)
|
| 183 |
+
cm = confusion_matrix(y_test, y_pred)
|
| 184 |
+
"""
|
| 185 |
+
st.code(code_5, language='python')
|
| 186 |
+
|
| 187 |
+
# Output for Model Evaluation
|
| 188 |
+
y_pred_probs = model.predict_proba(x_test_std)
|
| 189 |
+
loss = log_loss(y_test, y_pred_probs)
|
| 190 |
+
y_pred = np.argmax(y_pred_probs, axis=1)
|
| 191 |
+
cm = confusion_matrix(y_test, y_pred)
|
| 192 |
+
|
| 193 |
+
st.write(f"Log-Loss Score: {loss}")
|
| 194 |
+
st.write("Confusion Matrix:")
|
| 195 |
+
st.write(cm)
|
| 196 |
+
|
| 197 |
+
import streamlit as st
|
| 198 |
+
|
| 199 |
+
# Custom CSS for buttons
|
| 200 |
+
st.markdown(
|
| 201 |
+
"""
|
| 202 |
+
<style>
|
| 203 |
+
.custom-button {
|
| 204 |
+
display: inline-block;
|
| 205 |
+
padding: 5px 10px;
|
| 206 |
+
font-size: 14px;
|
| 207 |
+
color: #ffffff;
|
| 208 |
+
background-color: #4CAF50;
|
| 209 |
+
border: none;
|
| 210 |
+
border-radius: 5px;
|
| 211 |
+
text-align: center;
|
| 212 |
+
text-decoration: none;
|
| 213 |
+
transition: background-color 0.3s ease, transform 0.2s ease;
|
| 214 |
+
cursor: pointer;
|
| 215 |
+
}
|
| 216 |
+
.custom-button:hover {
|
| 217 |
+
background-color: #45a049;
|
| 218 |
+
transform: scale(1.05);
|
| 219 |
+
}
|
| 220 |
+
.button-container {
|
| 221 |
+
display: flex;
|
| 222 |
+
justify-content: space-between;
|
| 223 |
+
margin-top: 20px;
|
| 224 |
+
}
|
| 225 |
+
</style>
|
| 226 |
+
""",
|
| 227 |
+
unsafe_allow_html=True,
|
| 228 |
+
)
|
| 229 |
|
| 230 |
+
# Navigation Buttons
|
| 231 |
+
st.markdown(
|
| 232 |
+
"""
|
| 233 |
+
<div class="button-container">
|
| 234 |
+
<a href="pages/3_EDA_and_Feature_Engineering" target="_self" class="custom-button">Previous ⏮️</a>
|
| 235 |
+
<a href="pages/5_Conclusion" target="_self" class="custom-button">Next ⏭️</a>
|
| 236 |
+
</div>
|
| 237 |
+
""",
|
| 238 |
+
unsafe_allow_html=True,
|
| 239 |
+
)
|
| 240 |
|
|
|
|
|
|
|
|
|
|
| 241 |
|