Update pages/Model Creation with Optuna.py
Browse files- pages/Model Creation with Optuna.py +167 -49
pages/Model Creation with Optuna.py
CHANGED
|
@@ -1,58 +1,176 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
import pandas as pd
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
-
#
|
| 9 |
-
st.
|
| 10 |
-
st.markdown("""
|
| 11 |
-
Upload your dataset, select features and target, and let Optuna optimize hyperparameters
|
| 12 |
-
to train the best Random Forest model.
|
| 13 |
-
""")
|
| 14 |
-
|
| 15 |
-
# File uploader
|
| 16 |
-
uploaded_file = st.file_uploader("Upload your prepared dataset (CSV format):", type=["csv"])
|
| 17 |
|
| 18 |
if uploaded_file is not None:
|
|
|
|
| 19 |
data = pd.read_csv(uploaded_file)
|
| 20 |
-
st.write("### Dataset:")
|
| 21 |
st.dataframe(data)
|
| 22 |
|
| 23 |
-
#
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
else:
|
| 58 |
-
st.warning("
|
|
|
|
| 1 |
# Streamlit page: upload a dataset, preprocess it (encoding, optional SMOTE,
# optional scaling), let Optuna tune hyperparameters over the selected model
# families, then train/evaluate the best configuration on a held-out split.
#
# Theory recap: after EDA the data is split into train/test sets, candidate
# models (LogisticRegression, KNN, RandomForest, SVC) are compared, features
# may be scaled (StandardScaler / MinMaxScaler), Optuna searches the
# hyperparameter space (Bayesian-style sampling) maximizing cross-validated
# accuracy, and the winning model is evaluated with the chosen metrics.
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
# FIX: precision_score, recall_score and f1_score were used in the evaluation
# section below but never imported, raising NameError when those metrics were
# selected by the user.
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
import optuna

# File uploader for dataset
uploaded_file = st.file_uploader("Upload your dataset (CSV format):", type=["csv"])

if uploaded_file is not None:
    # Read and display the dataset
    data = pd.read_csv(uploaded_file)
    st.write("### Uploaded Dataset:")
    st.dataframe(data)

    # Dataset Overview
    st.write("### Dataset Overview:")
    st.write(data.describe())

    # Missing values in the dataset
    st.write("### Missing Values:")
    st.write(data.isnull().sum())

    # Select target column for classification
    target_column = st.selectbox("Select target column", data.columns)

    # Handle Encoding
    encoding_method = st.selectbox("Select Encoding Method", ["None", "LabelEncoding", "OneHotEncoding"])
    if encoding_method == "LabelEncoding":
        label_encoder = LabelEncoder()
        data = data.apply(lambda col: label_encoder.fit_transform(col) if col.dtype == 'object' else col)
        st.write("Applied Label Encoding to categorical variables.")
    elif encoding_method == "OneHotEncoding":
        # NOTE(review): this one-hot encodes ALL object columns, including the
        # target if it is categorical, which would split the label into several
        # columns — confirm the target is numeric or excluded upstream.
        categorical_columns = data.select_dtypes(include=['object']).columns
        data = pd.get_dummies(data, columns=categorical_columns)
        st.write("Applied One-Hot Encoding to categorical variables.")

    # Class imbalance check and handling with SMOTE
    y = data[target_column]
    X = data.drop(columns=[target_column])
    value_counts = y.value_counts()
    st.write(f"Class distribution in {target_column}:")
    st.write(value_counts)
    # Oversample only when the minority class is under 25% of the majority.
    if value_counts.min() / value_counts.max() < 0.25:
        smote = SMOTE(random_state=42)
        X, y = smote.fit_resample(X, y)
        st.write("Applied SMOTE for balancing classes.")

    # Scaling
    scaling_method = st.selectbox("Select Scaling Method", ["None", "StandardScaler", "MinMaxScaler"])
    if scaling_method == "StandardScaler":
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
    elif scaling_method == "MinMaxScaler":
        scaler = MinMaxScaler()
        X_scaled = scaler.fit_transform(X)
    else:
        X_scaled = X  # No scaling if selected as "None"

    # Splitting data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

    # Model Selection options
    algorithms = st.multiselect("Select algorithms", ["RandomForest", "LogisticRegression", "SVC", "KNN"])

    # Metric selection
    metrics = st.multiselect("Select evaluation metrics", ["Accuracy", "Precision", "Recall", "F1-score"])

    # Valid (solver, penalty) combinations for LogisticRegression, encoded as
    # "solver:penalty" strings.
    # FIX: Optuna's suggest_categorical only accepts None/bool/int/float/str
    # choices — the original tuple choices are rejected by Optuna's
    # distributions/storages, so the combos are encoded as strings and split
    # wherever they are consumed.
    SOLVER_PENALTY_CHOICES = [
        "lbfgs:l2", "newton-cg:l2", "sag:l2",
        "saga:l1", "saga:l2", "saga:elasticnet",
    ]

    def objective(trial):
        """Optuna objective: sample a model family plus its hyperparameters
        and return the mean 5-fold cross-validated accuracy on the train set."""
        model_type = trial.suggest_categorical("model", algorithms)

        if model_type == "KNN":
            n_neighbors = trial.suggest_int("n_neighbors", 1, 100)
            p = trial.suggest_int("p", 1, 2)  # 1 = Manhattan, 2 = Euclidean
            model = KNeighborsClassifier(n_neighbors=n_neighbors, p=p)

        elif model_type == "LogisticRegression":
            solver, penalty = trial.suggest_categorical(
                "solver_penalty", SOLVER_PENALTY_CHOICES).split(":")
            # FIX: suggest_loguniform is deprecated/removed; use
            # suggest_float(..., log=True) instead.
            C = trial.suggest_float("C", 1e-5, 1e2, log=True)
            # FIX: multi_class="multinomial" is deprecated and removed in
            # recent scikit-learn; these solvers use multinomial loss anyway.
            if penalty == "elasticnet":
                model = LogisticRegression(C=C, solver=solver, penalty=penalty, l1_ratio=0.3)
            else:
                model = LogisticRegression(C=C, solver=solver, penalty=penalty)

        elif model_type == "RandomForest":
            n_estimators = trial.suggest_int("n_estimators", 50, 200)
            max_depth = trial.suggest_int("max_depth", 3, 10)
            model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)

        elif model_type == "SVC":
            C = trial.suggest_float("C", 1e-5, 1e2, log=True)
            kernel = trial.suggest_categorical("kernel", ["linear", "rbf"])
            model = SVC(C=C, kernel=kernel, random_state=42)

        # Cross-validation score (mean accuracy over 5 folds)
        score = cross_val_score(model, X_train, y_train, cv=5, scoring="accuracy").mean()
        return score

    # Run Optuna optimization
    if st.button("Start Hyperparameter Tuning"):
        # FIX: guard against an empty algorithm selection, which would make
        # suggest_categorical fail on an empty choice list.
        if not algorithms:
            st.warning("Please select at least one algorithm before tuning.")
        else:
            study = optuna.create_study(direction="maximize")
            study.optimize(objective, n_trials=100)
            st.write(f"Best trial: {study.best_trial.params}")
            st.write(f"Best score: {study.best_trial.value}")

            # Rebuild the winning model from the best trial's parameters
            params = study.best_trial.params
            best_model_type = params['model']
            if best_model_type == "KNN":
                model = KNeighborsClassifier(n_neighbors=params['n_neighbors'], p=params['p'])
            elif best_model_type == "LogisticRegression":
                solver, penalty = params['solver_penalty'].split(":")
                # FIX: the original rebuild omitted l1_ratio, which sklearn
                # requires for penalty="elasticnet" — it crashed whenever the
                # best trial used saga+elasticnet. Mirror the objective exactly.
                if penalty == "elasticnet":
                    model = LogisticRegression(C=params['C'], solver=solver, penalty=penalty, l1_ratio=0.3)
                else:
                    model = LogisticRegression(C=params['C'], solver=solver, penalty=penalty)
            elif best_model_type == "RandomForest":
                model = RandomForestClassifier(n_estimators=params['n_estimators'],
                                               max_depth=params['max_depth'], random_state=42)
            elif best_model_type == "SVC":
                model = SVC(C=params['C'], kernel=params['kernel'], random_state=42)

            # Model training on the train split, prediction on the test split
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            # Evaluation with the user-selected metrics
            st.write("### Model Evaluation:")
            if "Accuracy" in metrics:
                accuracy = accuracy_score(y_test, y_pred)
                st.write(f"Accuracy: {accuracy}")
            if "Precision" in metrics:
                precision = precision_score(y_test, y_pred, average='weighted')
                st.write(f"Precision: {precision}")
            if "Recall" in metrics:
                recall = recall_score(y_test, y_pred, average='weighted')
                st.write(f"Recall: {recall}")
            if "F1-score" in metrics:
                f1 = f1_score(y_test, y_pred, average='weighted')
                st.write(f"F1-score: {f1}")

            # Display classification report
            st.write("### Classification Report:")
            st.write(classification_report(y_test, y_pred))
else:
    st.warning("Please upload a dataset to proceed with EDA.")
|