Update pages/4_Model_Creation_and_Evaluation.py
Browse files
pages/4_Model_Creation_and_Evaluation.py
CHANGED
|
@@ -104,15 +104,33 @@ if df is not None:
|
|
| 104 |
|
| 105 |
# Define the objective function for Optuna
|
| 106 |
st.code("""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
def objective(trial):
|
| 108 |
-
#
|
| 109 |
-
|
| 110 |
|
|
|
|
|
|
|
| 111 |
if algo == "svc":
|
| 112 |
-
# SVC
|
| 113 |
c = trial.suggest_float("C", 0.001, 1000, log=True)
|
| 114 |
kernel = trial.suggest_categorical("kernel", ['linear', 'poly', 'rbf', 'sigmoid'])
|
| 115 |
-
|
| 116 |
if kernel == 'poly':
|
| 117 |
degree = trial.suggest_int("degree", 1, 3)
|
| 118 |
model = SVC(C=c, kernel=kernel, degree=degree, random_state=42)
|
|
@@ -122,7 +140,7 @@ if df is not None:
|
|
| 122 |
else:
|
| 123 |
model = SVC(C=c, kernel=kernel, random_state=42)
|
| 124 |
else:
|
| 125 |
-
# Logistic Regression
|
| 126 |
solver, penalty = trial.suggest_categorical(
|
| 127 |
"choices", [
|
| 128 |
("lbfgs", "l2"), ("newton-cg", "l2"),
|
|
@@ -132,22 +150,62 @@ if df is not None:
|
|
| 132 |
)
|
| 133 |
reg_strength = trial.suggest_float("C", 0.001, 1000, log=True)
|
| 134 |
l1_ratio = trial.suggest_float("l1_ratio", 0, 1) if penalty == "elasticnet" else None
|
| 135 |
-
|
| 136 |
if penalty == "elasticnet":
|
| 137 |
-
model = LogisticRegression(
|
| 138 |
-
solver=solver, penalty=penalty, C=reg_strength,
|
| 139 |
-
l1_ratio=l1_ratio, random_state=42
|
| 140 |
-
)
|
| 141 |
else:
|
| 142 |
-
model = LogisticRegression(
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
""", language="python")
|
| 150 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
# Create and optimize the study
|
| 152 |
st.code("""
|
| 153 |
study = optuna.create_study(direction="maximize")
|
|
@@ -158,7 +216,7 @@ if df is not None:
|
|
| 158 |
|
| 159 |
# Create the best model
|
| 160 |
st.markdown("## Create the Best Model")
|
| 161 |
-
model = SVC(kernel='
|
| 162 |
st.write(model)
|
| 163 |
|
| 164 |
# Train the model
|
|
@@ -168,9 +226,57 @@ if df is not None:
|
|
| 168 |
# Model Evaluation
|
| 169 |
st.markdown("# Model Evaluation")
|
| 170 |
y_pred = model.predict(x_test_std)
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
|
| 175 |
else:
|
| 176 |
st.warning("No Dataset Found")
|
|
|
|
| 104 |
|
| 105 |
# Define the objective function for Optuna
|
| 106 |
st.code("""
|
| 107 |
+
import numpy as np
|
| 108 |
+
import optuna
|
| 109 |
+
from sklearn.svm import SVC
|
| 110 |
+
from sklearn.linear_model import LogisticRegression
|
| 111 |
+
from sklearn.model_selection import cross_validate
|
| 112 |
+
from sklearn.preprocessing import StandardScaler
|
| 113 |
+
|
| 114 |
+
# Check for NaN or infinite values in the data
|
| 115 |
+
assert not np.any(np.isnan(x_train_std)), "Input data contains NaN values"
|
| 116 |
+
assert not np.any(np.isnan(y_train)), "Target data contains NaN values"
|
| 117 |
+
assert not np.any(np.isinf(x_train_std)), "Input data contains infinite values"
|
| 118 |
+
|
| 119 |
+
# Global lists to store training and validation scores for each trial
|
| 120 |
+
training_scores = []
|
| 121 |
+
validation_scores = []
|
| 122 |
+
|
| 123 |
def objective(trial):
|
| 124 |
+
# Log trial parameters for debugging
|
| 125 |
+
print(f"Trial params: {trial.params}")
|
| 126 |
|
| 127 |
+
algo = trial.suggest_categorical("algo", ["lor", "svc"])
|
| 128 |
+
|
| 129 |
if algo == "svc":
|
| 130 |
+
# Hyperparameters for SVC
|
| 131 |
c = trial.suggest_float("C", 0.001, 1000, log=True)
|
| 132 |
kernel = trial.suggest_categorical("kernel", ['linear', 'poly', 'rbf', 'sigmoid'])
|
| 133 |
+
|
| 134 |
if kernel == 'poly':
|
| 135 |
degree = trial.suggest_int("degree", 1, 3)
|
| 136 |
model = SVC(C=c, kernel=kernel, degree=degree, random_state=42)
|
|
|
|
| 140 |
else:
|
| 141 |
model = SVC(C=c, kernel=kernel, random_state=42)
|
| 142 |
else:
|
| 143 |
+
# Hyperparameters for Logistic Regression
|
| 144 |
solver, penalty = trial.suggest_categorical(
|
| 145 |
"choices", [
|
| 146 |
("lbfgs", "l2"), ("newton-cg", "l2"),
|
|
|
|
| 150 |
)
|
| 151 |
reg_strength = trial.suggest_float("C", 0.001, 1000, log=True)
|
| 152 |
l1_ratio = trial.suggest_float("l1_ratio", 0, 1) if penalty == "elasticnet" else None
|
| 153 |
+
|
| 154 |
if penalty == "elasticnet":
|
| 155 |
+
model = LogisticRegression(solver=solver, penalty=penalty, C=reg_strength, l1_ratio=l1_ratio, random_state=42)
|
|
|
|
|
|
|
|
|
|
| 156 |
else:
|
| 157 |
+
model = LogisticRegression(solver=solver, penalty=penalty, C=reg_strength, random_state=42)
|
| 158 |
+
|
| 159 |
+
# Cross-validation scoring with training and validation
|
| 160 |
+
try:
|
| 161 |
+
scores = cross_validate(
|
| 162 |
+
model, x_train_std, y_train, cv=5,
|
| 163 |
+
scoring="accuracy", return_train_score=True
|
| 164 |
+
)
|
| 165 |
+
train_score = scores["train_score"].mean()
|
| 166 |
+
val_score = scores["test_score"].mean()
|
| 167 |
+
|
| 168 |
+
# Append scores to global lists
|
| 169 |
+
training_scores.append(train_score)
|
| 170 |
+
validation_scores.append(val_score)
|
| 171 |
+
except ValueError as e:
|
| 172 |
+
print(f"Error during cross-validation: {e}")
|
| 173 |
+
train_score, val_score = float("-inf"), float("-inf")
|
| 174 |
+
|
| 175 |
+
return val_score
|
| 176 |
+
|
| 177 |
+
# Running the optimization
|
| 178 |
+
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler())
|
| 179 |
+
study.optimize(objective, n_trials=100)
|
| 180 |
+
|
| 181 |
+
# Plotting training vs. validation scores
|
| 182 |
+
import matplotlib.pyplot as plt
|
| 183 |
+
|
| 184 |
+
plt.figure(figsize=(10, 6))
|
| 185 |
+
plt.plot(training_scores, label="Training Score", marker="o")
|
| 186 |
+
plt.plot(validation_scores, label="Validation Score", marker="x")
|
| 187 |
+
plt.xlabel("Trial")
|
| 188 |
+
plt.ylabel("Accuracy")
|
| 189 |
+
plt.title("Training vs. Validation Scores Across Trials")
|
| 190 |
+
plt.legend()
|
| 191 |
+
plt.grid()
|
| 192 |
+
plt.show()
|
| 193 |
+
|
| 194 |
+
# Display best trial
|
| 195 |
+
print("Best Parameters:")
|
| 196 |
+
print(study.best_params)
|
| 197 |
+
|
| 198 |
""", language="python")
|
| 199 |
|
| 200 |
+
st.markdown(
|
| 201 |
+
"""
|
| 202 |
+
<div style="text-align: center;">
|
| 203 |
+
<img src="https://cdn-uploads.huggingface.co/production/uploads/67441c51a784a9d15cb12871/FqUoV8hSyCWU3WocaqqGc.png" width="70%" />
|
| 204 |
+
</div>
|
| 205 |
+
""",
|
| 206 |
+
unsafe_allow_html=True
|
| 207 |
+
)
|
| 208 |
+
|
| 209 |
# Create and optimize the study
|
| 210 |
st.code("""
|
| 211 |
study = optuna.create_study(direction="maximize")
|
|
|
|
| 216 |
|
| 217 |
# Create the best model
|
| 218 |
st.markdown("## Create the Best Model")
|
| 219 |
+
model = SVC(kernel='poly', gamma = 'scale', C = 974.1963187644974, degree = 2)
|
| 220 |
st.write(model)
|
| 221 |
|
| 222 |
# Train the model
|
|
|
|
| 226 |
# Model Evaluation
|
| 227 |
st.markdown("# Model Evaluation")
|
| 228 |
y_pred = model.predict(x_test_std)
|
| 229 |
+
|
| 230 |
+
# Evaluation metrics
|
| 231 |
+
print("Accuracy:", accuracy_score(y_test, y_pred))
|
| 232 |
+
print("Classification Report:\n", classification_report(y_test, y_pred))
|
| 233 |
+
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
|
| 234 |
+
|
| 235 |
+
import streamlit as st
|
| 236 |
+
import pandas as pd
|
| 237 |
+
import seaborn as sns
|
| 238 |
+
import matplotlib.pyplot as plt
|
| 239 |
+
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
|
| 240 |
+
|
| 241 |
+
# Example: Replace this with your actual test data and predictions
|
| 242 |
+
y_pred = model.predict(x_test_std)
|
| 243 |
+
|
| 244 |
+
# Calculate evaluation metrics
|
| 245 |
+
conf_matrix = confusion_matrix(y_test, y_pred)
|
| 246 |
+
class_report = classification_report(y_test, y_pred, output_dict=True) # Output as a dictionary
|
| 247 |
+
|
| 248 |
+
# Convert the classification report to a DataFrame
|
| 249 |
+
class_report_df = pd.DataFrame(class_report).iloc[:-1, :-1] # Drop the last row (support) and the last column (weighted avg); rows are metrics, columns are classes/averages
|
| 250 |
+
|
| 251 |
+
# Streamlit app
|
| 252 |
+
st.title("Model Evaluation: Confusion Matrix and Classification Report")
|
| 253 |
+
|
| 254 |
+
# Plotting with Matplotlib and Seaborn
|
| 255 |
+
fig, axs = plt.subplots(1, 2, figsize=(16, 6))
|
| 256 |
+
|
| 257 |
+
# Confusion Matrix Heatmap
|
| 258 |
+
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", cbar=False, ax=axs[0], annot_kws={"size": 14})
|
| 259 |
+
axs[0].set_title("Confusion Matrix", fontsize=16)
|
| 260 |
+
axs[0].set_xlabel("Predicted Labels", fontsize=14)
|
| 261 |
+
axs[0].set_ylabel("True Labels", fontsize=14)
|
| 262 |
+
|
| 263 |
+
# Classification Report Heatmap
|
| 264 |
+
sns.heatmap(class_report_df, annot=True, fmt=".2f", cmap="YlGnBu", cbar=False, ax=axs[1], annot_kws={"size": 12})
|
| 265 |
+
axs[1].set_title("Classification Report", fontsize=16)
|
| 266 |
+
axs[1].set_xlabel("Metrics", fontsize=14)
|
| 267 |
+
axs[1].set_ylabel("Classes", fontsize=14)
|
| 268 |
+
|
| 269 |
+
# Adjust layout
|
| 270 |
+
plt.tight_layout()
|
| 271 |
+
|
| 272 |
+
# Display the plots in Streamlit
|
| 273 |
+
st.pyplot(fig)
|
| 274 |
+
|
| 275 |
+
# Display additional metrics (optional)
|
| 276 |
+
accuracy = accuracy_score(y_test, y_pred)
|
| 277 |
+
st.write(f"**Accuracy:** {accuracy:.2f}")
|
| 278 |
+
|
| 279 |
+
|
| 280 |
|
| 281 |
else:
|
| 282 |
st.warning("No Dataset Found")
|