# pages/Model Creation with Optuna.py
# Streamlit page: dataset upload, preprocessing, and classifier
# hyperparameter tuning with Optuna.
import optuna
import numpy as np
import pandas as pd
import streamlit as st

from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.svm import SVC
# ---------------------------------------------------------------------------
# App body: upload a CSV, preprocess it (encoding, SMOTE balancing, scaling),
# then search model + hyperparameters with Optuna and evaluate the winner.
#
# Workflow: data split (70/30) -> optional encoding/scaling -> Optuna
# (Bayesian-style sampling) maximizes 5-fold CV accuracy over the chosen
# algorithms -> the best trial's model is refit and scored on the test split.
# ---------------------------------------------------------------------------

# File uploader for dataset
uploaded_file = st.file_uploader("Upload your dataset (CSV format):", type=["csv"])

if uploaded_file is not None:
    # Read and display the dataset
    data = pd.read_csv(uploaded_file)
    st.write("### Uploaded Dataset:")
    st.dataframe(data)

    # Dataset overview and missing-value summary
    st.write("### Dataset Overview:")
    st.write(data.describe())
    st.write("### Missing Values:")
    st.write(data.isnull().sum())

    # Select target column for classification
    target_column = st.selectbox("Select target column", data.columns)

    # ----- Encoding --------------------------------------------------------
    encoding_method = st.selectbox("Select Encoding Method", ["None", "LabelEncoding", "OneHotEncoding"])
    if encoding_method == "LabelEncoding":
        label_encoder = LabelEncoder()
        # fit_transform re-fits the encoder per column, so one instance suffices.
        data = data.apply(lambda col: label_encoder.fit_transform(col) if col.dtype == 'object' else col)
        st.write("Applied Label Encoding to categorical variables.")
    elif encoding_method == "OneHotEncoding":
        # BUG FIX: exclude the target — one-hot encoding it would remove the
        # `target_column` column and crash the X/y split below with a KeyError.
        categorical_columns = data.select_dtypes(include=['object']).columns.drop(target_column, errors="ignore")
        data = pd.get_dummies(data, columns=categorical_columns)
        st.write("Applied One-Hot Encoding to categorical variables.")

    # ----- Class imbalance check and handling with SMOTE -------------------
    y = data[target_column]
    X = data.drop(columns=[target_column])
    value_counts = y.value_counts()
    st.write(f"Class distribution in {target_column}:")
    st.write(value_counts)
    # Oversample when the rarest class is under 25% the size of the commonest.
    if value_counts.min() / value_counts.max() < 0.25:
        smote = SMOTE(random_state=42)
        X, y = smote.fit_resample(X, y)
        st.write("Applied SMOTE for balancing classes.")

    # ----- Feature scaling --------------------------------------------------
    scaling_method = st.selectbox("Select Scaling Method", ["None", "StandardScaler", "MinMaxScaler"])
    if scaling_method == "StandardScaler":
        X_scaled = StandardScaler().fit_transform(X)
    elif scaling_method == "MinMaxScaler":
        X_scaled = MinMaxScaler().fit_transform(X)
    else:
        X_scaled = X  # No scaling if selected as "None"

    # Train/test split: 70/30, seeded for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

    # ----- Model and metric selection ---------------------------------------
    algorithms = st.multiselect("Select algorithms", ["RandomForest", "LogisticRegression", "SVC", "KNN"])
    metrics = st.multiselect("Select evaluation metrics", ["Accuracy", "Precision", "Recall", "F1-score"])

    # Valid (solver, penalty) combinations, encoded as "solver:penalty" strings.
    # BUG FIX: Optuna's suggest_categorical only accepts None/bool/int/float/str
    # choices, so offering tuples (as the original did) raises a ValueError.
    SOLVER_PENALTY_CHOICES = [
        "lbfgs:l2", "newton-cg:l2", "sag:l2",
        "saga:l1", "saga:l2", "saga:elasticnet",
    ]

    def _make_logreg(C, solver, penalty):
        """Build a LogisticRegression; elasticnet additionally requires l1_ratio."""
        if penalty == "elasticnet":
            return LogisticRegression(C=C, solver=solver, penalty=penalty,
                                      multi_class="multinomial", l1_ratio=0.3)
        return LogisticRegression(C=C, solver=solver, penalty=penalty,
                                  multi_class="multinomial")

    # Optuna hyperparameter tuning function
    def objective(trial):
        """Sample one model + hyperparameter configuration and return its
        mean 5-fold cross-validated accuracy on the training split."""
        model_type = trial.suggest_categorical("model", algorithms)
        if model_type == "KNN":
            n_neighbors = trial.suggest_int("n_neighbors", 1, 100)
            p = trial.suggest_int("p", 1, 2)  # 1 = Manhattan, 2 = Euclidean
            model = KNeighborsClassifier(n_neighbors=n_neighbors, p=p)
        elif model_type == "LogisticRegression":
            solver_penalty = trial.suggest_categorical("solver_penalty", SOLVER_PENALTY_CHOICES)
            solver, penalty = solver_penalty.split(":")
            # suggest_float(log=True) replaces the deprecated suggest_loguniform.
            C = trial.suggest_float("C", 1e-5, 1e2, log=True)
            model = _make_logreg(C, solver, penalty)
        elif model_type == "RandomForest":
            n_estimators = trial.suggest_int("n_estimators", 50, 200)
            max_depth = trial.suggest_int("max_depth", 3, 10)
            model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
        elif model_type == "SVC":
            C = trial.suggest_float("C", 1e-5, 1e2, log=True)
            kernel = trial.suggest_categorical("kernel", ["linear", "rbf"])
            model = SVC(C=C, kernel=kernel, random_state=42)
        # Objective value: mean cross-validation accuracy.
        return cross_val_score(model, X_train, y_train, cv=5, scoring="accuracy").mean()

    # Run Optuna optimization
    if st.button("Start Hyperparameter Tuning"):
        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=100)
        st.write(f"Best trial: {study.best_trial.params}")
        st.write(f"Best score: {study.best_trial.value}")

        # Rebuild the best model from the winning trial's parameters.
        best = study.best_trial.params
        best_model_type = best['model']
        if best_model_type == "KNN":
            model = KNeighborsClassifier(n_neighbors=best['n_neighbors'], p=best['p'])
        elif best_model_type == "LogisticRegression":
            # BUG FIX: the stored categorical is a "solver:penalty" string (not
            # a tuple), and elasticnet needs l1_ratio — _make_logreg handles both.
            solver, penalty = best['solver_penalty'].split(":")
            model = _make_logreg(best['C'], solver, penalty)
        elif best_model_type == "RandomForest":
            model = RandomForestClassifier(n_estimators=best['n_estimators'],
                                           max_depth=best['max_depth'], random_state=42)
        elif best_model_type == "SVC":
            model = SVC(C=best['C'], kernel=best['kernel'], random_state=42)

        # Refit on the full training split, score on the held-out test split.
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Evaluation (weighted averages so multi-class targets work too).
        st.write("### Model Evaluation:")
        if "Accuracy" in metrics:
            st.write(f"Accuracy: {accuracy_score(y_test, y_pred)}")
        if "Precision" in metrics:
            st.write(f"Precision: {precision_score(y_test, y_pred, average='weighted')}")
        if "Recall" in metrics:
            st.write(f"Recall: {recall_score(y_test, y_pred, average='weighted')}")
        if "F1-score" in metrics:
            st.write(f"F1-score: {f1_score(y_test, y_pred, average='weighted')}")

        # Display classification report
        st.write("### Classification Report:")
        st.write(classification_report(y_test, y_pred))
else:
    st.warning("Please upload a dataset to proceed with EDA.")