Update pages/Model Creation with Optuna.py
Browse files- pages/Model Creation with Optuna.py +167 -49
pages/Model Creation with Optuna.py
CHANGED
|
@@ -1,58 +1,176 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
import pandas as pd
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
-
#
|
| 9 |
-
st.
|
| 10 |
-
st.markdown("""
|
| 11 |
-
Upload your dataset, select features and target, and let Optuna optimize hyperparameters
|
| 12 |
-
to train the best Random Forest model.
|
| 13 |
-
""")
|
| 14 |
-
|
| 15 |
-
# File uploader
|
| 16 |
-
uploaded_file = st.file_uploader("Upload your prepared dataset (CSV format):", type=["csv"])
|
| 17 |
|
| 18 |
if uploaded_file is not None:
|
|
|
|
| 19 |
data = pd.read_csv(uploaded_file)
|
| 20 |
-
st.write("### Dataset:")
|
| 21 |
st.dataframe(data)
|
| 22 |
|
| 23 |
-
#
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
else:
|
| 58 |
-
st.warning("
|
|
|
|
| 1 |
# Streamlit page: upload a dataset, preprocess it (encoding, optional SMOTE,
# optional scaling), let Optuna tune hyperparameters over the selected model
# families, then train/evaluate the best configuration on a held-out split.
#
# Theory recap: after EDA the data is split into train/test sets, candidate
# models (LogisticRegression, KNN, RandomForest, SVC) are compared, features
# may be scaled (StandardScaler / MinMaxScaler), Optuna searches the
# hyperparameter space (Bayesian-style sampling) maximizing cross-validated
# accuracy, and the winning model is evaluated with the chosen metrics.
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
# FIX: precision_score, recall_score and f1_score were used in the evaluation
# section below but never imported, raising NameError when those metrics were
# selected by the user.
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
import optuna

# File uploader for dataset
uploaded_file = st.file_uploader("Upload your dataset (CSV format):", type=["csv"])

if uploaded_file is not None:
    # Read and display the dataset
    data = pd.read_csv(uploaded_file)
    st.write("### Uploaded Dataset:")
    st.dataframe(data)

    # Dataset Overview
    st.write("### Dataset Overview:")
    st.write(data.describe())

    # Missing values in the dataset
    st.write("### Missing Values:")
    st.write(data.isnull().sum())

    # Select target column for classification
    target_column = st.selectbox("Select target column", data.columns)

    # Handle Encoding
    encoding_method = st.selectbox("Select Encoding Method", ["None", "LabelEncoding", "OneHotEncoding"])
    if encoding_method == "LabelEncoding":
        label_encoder = LabelEncoder()
        data = data.apply(lambda col: label_encoder.fit_transform(col) if col.dtype == 'object' else col)
        st.write("Applied Label Encoding to categorical variables.")
    elif encoding_method == "OneHotEncoding":
        # NOTE(review): this one-hot encodes ALL object columns, including the
        # target if it is categorical, which would split the label into several
        # columns — confirm the target is numeric or excluded upstream.
        categorical_columns = data.select_dtypes(include=['object']).columns
        data = pd.get_dummies(data, columns=categorical_columns)
        st.write("Applied One-Hot Encoding to categorical variables.")

    # Class imbalance check and handling with SMOTE
    y = data[target_column]
    X = data.drop(columns=[target_column])
    value_counts = y.value_counts()
    st.write(f"Class distribution in {target_column}:")
    st.write(value_counts)
    # Oversample only when the minority class is under 25% of the majority.
    if value_counts.min() / value_counts.max() < 0.25:
        smote = SMOTE(random_state=42)
        X, y = smote.fit_resample(X, y)
        st.write("Applied SMOTE for balancing classes.")

    # Scaling
    scaling_method = st.selectbox("Select Scaling Method", ["None", "StandardScaler", "MinMaxScaler"])
    if scaling_method == "StandardScaler":
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
    elif scaling_method == "MinMaxScaler":
        scaler = MinMaxScaler()
        X_scaled = scaler.fit_transform(X)
    else:
        X_scaled = X  # No scaling if selected as "None"

    # Splitting data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

    # Model Selection options
    algorithms = st.multiselect("Select algorithms", ["RandomForest", "LogisticRegression", "SVC", "KNN"])

    # Metric selection
    metrics = st.multiselect("Select evaluation metrics", ["Accuracy", "Precision", "Recall", "F1-score"])

    # Valid (solver, penalty) combinations for LogisticRegression, encoded as
    # "solver:penalty" strings.
    # FIX: Optuna's suggest_categorical only accepts None/bool/int/float/str
    # choices — the original tuple choices are rejected by Optuna's
    # distributions/storages, so the combos are encoded as strings and split
    # wherever they are consumed.
    SOLVER_PENALTY_CHOICES = [
        "lbfgs:l2", "newton-cg:l2", "sag:l2",
        "saga:l1", "saga:l2", "saga:elasticnet",
    ]

    def objective(trial):
        """Optuna objective: sample a model family plus its hyperparameters
        and return the mean 5-fold cross-validated accuracy on the train set."""
        model_type = trial.suggest_categorical("model", algorithms)

        if model_type == "KNN":
            n_neighbors = trial.suggest_int("n_neighbors", 1, 100)
            p = trial.suggest_int("p", 1, 2)  # 1 = Manhattan, 2 = Euclidean
            model = KNeighborsClassifier(n_neighbors=n_neighbors, p=p)

        elif model_type == "LogisticRegression":
            solver, penalty = trial.suggest_categorical(
                "solver_penalty", SOLVER_PENALTY_CHOICES).split(":")
            # FIX: suggest_loguniform is deprecated/removed; use
            # suggest_float(..., log=True) instead.
            C = trial.suggest_float("C", 1e-5, 1e2, log=True)
            # FIX: multi_class="multinomial" is deprecated and removed in
            # recent scikit-learn; these solvers use multinomial loss anyway.
            if penalty == "elasticnet":
                model = LogisticRegression(C=C, solver=solver, penalty=penalty, l1_ratio=0.3)
            else:
                model = LogisticRegression(C=C, solver=solver, penalty=penalty)

        elif model_type == "RandomForest":
            n_estimators = trial.suggest_int("n_estimators", 50, 200)
            max_depth = trial.suggest_int("max_depth", 3, 10)
            model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)

        elif model_type == "SVC":
            C = trial.suggest_float("C", 1e-5, 1e2, log=True)
            kernel = trial.suggest_categorical("kernel", ["linear", "rbf"])
            model = SVC(C=C, kernel=kernel, random_state=42)

        # Cross-validation score (mean accuracy over 5 folds)
        score = cross_val_score(model, X_train, y_train, cv=5, scoring="accuracy").mean()
        return score

    # Run Optuna optimization
    if st.button("Start Hyperparameter Tuning"):
        # FIX: guard against an empty algorithm selection, which would make
        # suggest_categorical fail on an empty choice list.
        if not algorithms:
            st.warning("Please select at least one algorithm before tuning.")
        else:
            study = optuna.create_study(direction="maximize")
            study.optimize(objective, n_trials=100)
            st.write(f"Best trial: {study.best_trial.params}")
            st.write(f"Best score: {study.best_trial.value}")

            # Rebuild the winning model from the best trial's parameters
            params = study.best_trial.params
            best_model_type = params['model']
            if best_model_type == "KNN":
                model = KNeighborsClassifier(n_neighbors=params['n_neighbors'], p=params['p'])
            elif best_model_type == "LogisticRegression":
                solver, penalty = params['solver_penalty'].split(":")
                # FIX: the original rebuild omitted l1_ratio, which sklearn
                # requires for penalty="elasticnet" — it crashed whenever the
                # best trial used saga+elasticnet. Mirror the objective exactly.
                if penalty == "elasticnet":
                    model = LogisticRegression(C=params['C'], solver=solver, penalty=penalty, l1_ratio=0.3)
                else:
                    model = LogisticRegression(C=params['C'], solver=solver, penalty=penalty)
            elif best_model_type == "RandomForest":
                model = RandomForestClassifier(n_estimators=params['n_estimators'],
                                               max_depth=params['max_depth'], random_state=42)
            elif best_model_type == "SVC":
                model = SVC(C=params['C'], kernel=params['kernel'], random_state=42)

            # Model training on the train split, prediction on the test split
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            # Evaluation with the user-selected metrics
            st.write("### Model Evaluation:")
            if "Accuracy" in metrics:
                accuracy = accuracy_score(y_test, y_pred)
                st.write(f"Accuracy: {accuracy}")
            if "Precision" in metrics:
                precision = precision_score(y_test, y_pred, average='weighted')
                st.write(f"Precision: {precision}")
            if "Recall" in metrics:
                recall = recall_score(y_test, y_pred, average='weighted')
                st.write(f"Recall: {recall}")
            if "F1-score" in metrics:
                f1 = f1_score(y_test, y_pred, average='weighted')
                st.write(f"F1-score: {f1}")

            # Display classification report
            st.write("### Classification Report:")
            st.write(classification_report(y_test, y_pred))
else:
    st.warning("Please upload a dataset to proceed with EDA.")
|