# pages/Model Creation with Optuna.py
# Streamlit page: dataset upload, preprocessing, and classifier
# hyperparameter tuning with Optuna.
import optuna
import numpy as np
import pandas as pd
import streamlit as st

from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.svm import SVC
# ---------------------------------------------------------------------------
# App body: upload a CSV, preprocess it (encoding, SMOTE balancing, scaling),
# then search model + hyperparameters with Optuna and evaluate the winner.
#
# Workflow: data split (70/30) -> optional encoding/scaling -> Optuna
# (Bayesian-style sampling) maximizes 5-fold CV accuracy over the chosen
# algorithms -> the best trial's model is refit and scored on the test split.
# ---------------------------------------------------------------------------

# File uploader for dataset
uploaded_file = st.file_uploader("Upload your dataset (CSV format):", type=["csv"])

if uploaded_file is not None:
    # Read and display the dataset
    data = pd.read_csv(uploaded_file)
    st.write("### Uploaded Dataset:")
    st.dataframe(data)

    # Dataset overview and missing-value summary
    st.write("### Dataset Overview:")
    st.write(data.describe())
    st.write("### Missing Values:")
    st.write(data.isnull().sum())

    # Select target column for classification
    target_column = st.selectbox("Select target column", data.columns)

    # ----- Encoding --------------------------------------------------------
    encoding_method = st.selectbox("Select Encoding Method", ["None", "LabelEncoding", "OneHotEncoding"])
    if encoding_method == "LabelEncoding":
        label_encoder = LabelEncoder()
        # fit_transform re-fits the encoder per column, so one instance suffices.
        data = data.apply(lambda col: label_encoder.fit_transform(col) if col.dtype == 'object' else col)
        st.write("Applied Label Encoding to categorical variables.")
    elif encoding_method == "OneHotEncoding":
        # BUG FIX: exclude the target — one-hot encoding it would remove the
        # `target_column` column and crash the X/y split below with a KeyError.
        categorical_columns = data.select_dtypes(include=['object']).columns.drop(target_column, errors="ignore")
        data = pd.get_dummies(data, columns=categorical_columns)
        st.write("Applied One-Hot Encoding to categorical variables.")

    # ----- Class imbalance check and handling with SMOTE -------------------
    y = data[target_column]
    X = data.drop(columns=[target_column])
    value_counts = y.value_counts()
    st.write(f"Class distribution in {target_column}:")
    st.write(value_counts)
    # Oversample when the rarest class is under 25% the size of the commonest.
    if value_counts.min() / value_counts.max() < 0.25:
        smote = SMOTE(random_state=42)
        X, y = smote.fit_resample(X, y)
        st.write("Applied SMOTE for balancing classes.")

    # ----- Feature scaling --------------------------------------------------
    scaling_method = st.selectbox("Select Scaling Method", ["None", "StandardScaler", "MinMaxScaler"])
    if scaling_method == "StandardScaler":
        X_scaled = StandardScaler().fit_transform(X)
    elif scaling_method == "MinMaxScaler":
        X_scaled = MinMaxScaler().fit_transform(X)
    else:
        X_scaled = X  # No scaling if selected as "None"

    # Train/test split: 70/30, seeded for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

    # ----- Model and metric selection ---------------------------------------
    algorithms = st.multiselect("Select algorithms", ["RandomForest", "LogisticRegression", "SVC", "KNN"])
    metrics = st.multiselect("Select evaluation metrics", ["Accuracy", "Precision", "Recall", "F1-score"])

    # Valid (solver, penalty) combinations, encoded as "solver:penalty" strings.
    # BUG FIX: Optuna's suggest_categorical only accepts None/bool/int/float/str
    # choices, so offering tuples (as the original did) raises a ValueError.
    SOLVER_PENALTY_CHOICES = [
        "lbfgs:l2", "newton-cg:l2", "sag:l2",
        "saga:l1", "saga:l2", "saga:elasticnet",
    ]

    def _make_logreg(C, solver, penalty):
        """Build a LogisticRegression; elasticnet additionally requires l1_ratio."""
        if penalty == "elasticnet":
            return LogisticRegression(C=C, solver=solver, penalty=penalty,
                                      multi_class="multinomial", l1_ratio=0.3)
        return LogisticRegression(C=C, solver=solver, penalty=penalty,
                                  multi_class="multinomial")

    # Optuna hyperparameter tuning function
    def objective(trial):
        """Sample one model + hyperparameter configuration and return its
        mean 5-fold cross-validated accuracy on the training split."""
        model_type = trial.suggest_categorical("model", algorithms)
        if model_type == "KNN":
            n_neighbors = trial.suggest_int("n_neighbors", 1, 100)
            p = trial.suggest_int("p", 1, 2)  # 1 = Manhattan, 2 = Euclidean
            model = KNeighborsClassifier(n_neighbors=n_neighbors, p=p)
        elif model_type == "LogisticRegression":
            solver_penalty = trial.suggest_categorical("solver_penalty", SOLVER_PENALTY_CHOICES)
            solver, penalty = solver_penalty.split(":")
            # suggest_float(log=True) replaces the deprecated suggest_loguniform.
            C = trial.suggest_float("C", 1e-5, 1e2, log=True)
            model = _make_logreg(C, solver, penalty)
        elif model_type == "RandomForest":
            n_estimators = trial.suggest_int("n_estimators", 50, 200)
            max_depth = trial.suggest_int("max_depth", 3, 10)
            model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
        elif model_type == "SVC":
            C = trial.suggest_float("C", 1e-5, 1e2, log=True)
            kernel = trial.suggest_categorical("kernel", ["linear", "rbf"])
            model = SVC(C=C, kernel=kernel, random_state=42)
        # Objective value: mean cross-validation accuracy.
        return cross_val_score(model, X_train, y_train, cv=5, scoring="accuracy").mean()

    # Run Optuna optimization
    if st.button("Start Hyperparameter Tuning"):
        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=100)
        st.write(f"Best trial: {study.best_trial.params}")
        st.write(f"Best score: {study.best_trial.value}")

        # Rebuild the best model from the winning trial's parameters.
        best = study.best_trial.params
        best_model_type = best['model']
        if best_model_type == "KNN":
            model = KNeighborsClassifier(n_neighbors=best['n_neighbors'], p=best['p'])
        elif best_model_type == "LogisticRegression":
            # BUG FIX: the stored categorical is a "solver:penalty" string (not
            # a tuple), and elasticnet needs l1_ratio — _make_logreg handles both.
            solver, penalty = best['solver_penalty'].split(":")
            model = _make_logreg(best['C'], solver, penalty)
        elif best_model_type == "RandomForest":
            model = RandomForestClassifier(n_estimators=best['n_estimators'],
                                           max_depth=best['max_depth'], random_state=42)
        elif best_model_type == "SVC":
            model = SVC(C=best['C'], kernel=best['kernel'], random_state=42)

        # Refit on the full training split, score on the held-out test split.
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Evaluation (weighted averages so multi-class targets work too).
        st.write("### Model Evaluation:")
        if "Accuracy" in metrics:
            st.write(f"Accuracy: {accuracy_score(y_test, y_pred)}")
        if "Precision" in metrics:
            st.write(f"Precision: {precision_score(y_test, y_pred, average='weighted')}")
        if "Recall" in metrics:
            st.write(f"Recall: {recall_score(y_test, y_pred, average='weighted')}")
        if "F1-score" in metrics:
            st.write(f"F1-score: {f1_score(y_test, y_pred, average='weighted')}")

        # Display classification report
        st.write("### Classification Report:")
        st.write(classification_report(y_test, y_pred))
else:
    st.warning("Please upload a dataset to proceed with EDA.")