Mpavan45 committed on
Commit
6c0899e
·
verified ·
1 Parent(s): f7209ee

Update pages/Model Creation with Optuna.py

Browse files
Files changed (1) hide show
  1. pages/Model Creation with Optuna.py +167 -49
pages/Model Creation with Optuna.py CHANGED
@@ -1,58 +1,176 @@
1
  import streamlit as st
2
  import pandas as pd
3
- # import optuna
4
- # from sklearn.ensemble import RandomForestClassifier
5
- # from sklearn.model_selection import train_test_split, cross_val_score
6
- # from sklearn.metrics import classification_report
 
 
 
 
 
 
7
 
8
- # Title
9
- st.title("Model Creation and Hyperparameter Tuning with Optuna")
10
- st.markdown("""
11
- Upload your dataset, select features and target, and let Optuna optimize hyperparameters
12
- to train the best Random Forest model.
13
- """)
14
-
15
- # File uploader
16
- uploaded_file = st.file_uploader("Upload your prepared dataset (CSV format):", type=["csv"])
17
 
18
  if uploaded_file is not None:
 
19
  data = pd.read_csv(uploaded_file)
20
- st.write("### Dataset:")
21
  st.dataframe(data)
22
 
23
- # Feature and target selection
24
- features = st.multiselect("Select Features:", options=data.columns)
25
- target = st.selectbox("Select Target:", options=data.columns)
26
-
27
- if features and target:
28
- X = data[features]
29
- y = data[target]
30
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
31
-
32
- def objective(trial):
33
- model = RandomForestClassifier(
34
- n_estimators=trial.suggest_int("n_estimators", 10, 200),
35
- max_depth=trial.suggest_int("max_depth", 2, 32, log=True),
36
- min_samples_split=trial.suggest_int("min_samples_split", 2, 20),
37
- min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 20),
38
- random_state=42,
39
- )
40
- return cross_val_score(model, X_train, y_train, cv=3, scoring="accuracy").mean()
41
-
42
- st.write("### Hyperparameter Tuning")
43
- n_trials = st.slider("Number of Trials:", 10, 100, 20)
44
- if st.button("Start Tuning"):
45
- study = optuna.create_study(direction="maximize")
46
- study.optimize(objective, n_trials=n_trials)
47
-
48
- st.write("#### Best Parameters:")
49
- st.json(study.best_params)
50
-
51
- model = RandomForestClassifier(**study.best_params, random_state=42)
52
- model.fit(X_train, y_train)
53
-
54
- y_pred = model.predict(X_test)
55
- st.write("### Model Performance:")
56
- st.text(classification_report(y_test, y_pred))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  else:
58
- st.warning("Upload a dataset to start.")
 
1
import streamlit as st
import pandas as pd
import numpy as np
import optuna
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.svm import SVC
13
 
14
+ # File uploader for dataset
15
+ uploaded_file = st.file_uploader("Upload your dataset (CSV format):", type=["csv"])
 
 
 
 
 
 
 
16
 
17
  if uploaded_file is not None:
18
+ # Read and display the dataset
19
  data = pd.read_csv(uploaded_file)
20
+ st.write("### Uploaded Dataset:")
21
  st.dataframe(data)
22
 
23
+ # Dataset Overview
24
+ st.write("### Dataset Overview:")
25
+ st.write(data.describe())
26
+
27
+ # Missing values in the dataset
28
+ st.write("### Missing Values:")
29
+ st.write(data.isnull().sum())
30
+
31
+ # Select target column for classification
32
+ target_column = st.selectbox("Select target column", data.columns)
33
+
34
+ # Handle Encoding
35
+ encoding_method = st.selectbox("Select Encoding Method", ["None", "LabelEncoding", "OneHotEncoding"])
36
+ if encoding_method == "LabelEncoding":
37
+ label_encoder = LabelEncoder()
38
+ data = data.apply(lambda col: label_encoder.fit_transform(col) if col.dtype == 'object' else col)
39
+ st.write("Applied Label Encoding to categorical variables.")
40
+ elif encoding_method == "OneHotEncoding":
41
+ categorical_columns = data.select_dtypes(include=['object']).columns
42
+ data = pd.get_dummies(data, columns=categorical_columns)
43
+ st.write("Applied One-Hot Encoding to categorical variables.")
44
+
45
+ # Class imbalance check and handling with SMOTE
46
+ y = data[target_column]
47
+ X = data.drop(columns=[target_column])
48
+ value_counts = y.value_counts()
49
+ st.write(f"Class distribution in {target_column}:")
50
+ st.write(value_counts)
51
+ if value_counts.min() / value_counts.max() < 0.25:
52
+ smote = SMOTE(random_state=42)
53
+ X, y = smote.fit_resample(X, y)
54
+ st.write("Applied SMOTE for balancing classes.")
55
+
56
+ # Scaling
57
+ scaling_method = st.selectbox("Select Scaling Method", ["None", "StandardScaler", "MinMaxScaler"])
58
+ if scaling_method == "StandardScaler":
59
+ scaler = StandardScaler()
60
+ X_scaled = scaler.fit_transform(X)
61
+ elif scaling_method == "MinMaxScaler":
62
+ scaler = MinMaxScaler()
63
+ X_scaled = scaler.fit_transform(X)
64
+ else:
65
+ X_scaled = X # No scaling if selected as "None"
66
+
67
+ # Splitting data into training and testing sets
68
+ X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
69
+
70
+ # Model Selection options
71
+ algorithms = st.multiselect("Select algorithms", ["RandomForest", "LogisticRegression", "SVC", "KNN"])
72
+
73
+ # Metric selection
74
+ metrics = st.multiselect("Select evaluation metrics", ["Accuracy", "Precision", "Recall", "F1-score"])
75
+
76
+ # **Theory: Model Training and Selection with Optuna**
77
+ # Model training and selection is a crucial phase in machine learning. After completing the exploratory data analysis (EDA),
78
+ # the next step is to build and optimize predictive models. This section focuses on the following key aspects:
79
+
80
+ # **Data Splitting**: The dataset is divided into training and testing sets. The training set is used to train the model,
81
+ # while the testing set is used to evaluate its performance on unseen data.
82
+
83
+ # **Model Selection**: Various machine learning algorithms can be used for solving the problem. In this section, we will consider:
84
+ # - Logistic Regression: A statistical model commonly used for binary classification tasks.
85
+ # - K-Nearest Neighbors (KNN): A non-parametric algorithm used for classification based on distance metrics.
86
+
87
+ # **Data Preprocessing**: Before training the model, the data may need to be preprocessed. This includes scaling features using techniques like:
88
+ # - StandardScaler: Standardizes features by removing the mean and scaling to unit variance.
89
+ # - MinMaxScaler: Scales features to a specific range, typically between 0 and 1.
90
+
91
+ # **Hyperparameter Tuning with Optuna**: Optuna is an automatic hyperparameter optimization framework that allows us to efficiently
92
+ # search for the best hyperparameters for our models. It uses a technique called Bayesian Optimization to find the optimal set of hyperparameters
93
+ # that maximize the model's performance.
94
+
95
+ # **Model Evaluation**: After the model is trained and optimized, its performance is evaluated using appropriate metrics, such as accuracy, precision, recall, F1-score, etc.
96
+
97
+ # This section focuses on using Optuna for hyperparameter tuning, ensuring the model performs optimally before deployment.
98
+
99
+ # Optuna hyperparameter tuning function
100
+ def objective(trial):
101
+ # Select model type
102
+ model_type = trial.suggest_categorical("model", algorithms)
103
+
104
+ if model_type == "KNN":
105
+ n_neighbors = trial.suggest_int("n_neighbors", 1, 100)
106
+ p = trial.suggest_int("p", 1, 2)
107
+ model = KNeighborsClassifier(n_neighbors=n_neighbors, p=p)
108
+
109
+ elif model_type == "LogisticRegression":
110
+ solver, penalty = trial.suggest_categorical("solver_penalty", [
111
+ ("lbfgs", "l2"), ("newton-cg", "l2"), ("sag", "l2"), ("saga", "l1"),
112
+ ("saga", "l2"), ("saga", "elasticnet")])
113
+ C = trial.suggest_loguniform("C", 1e-5, 1e2)
114
+ if penalty == "elasticnet":
115
+ model = LogisticRegression(C=C, solver=solver, penalty=penalty, multi_class="multinomial", l1_ratio=0.3)
116
+ else:
117
+ model = LogisticRegression(C=C, solver=solver, penalty=penalty, multi_class="multinomial")
118
+
119
+ elif model_type == "RandomForest":
120
+ n_estimators = trial.suggest_int("n_estimators", 50, 200)
121
+ max_depth = trial.suggest_int("max_depth", 3, 10)
122
+ model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
123
+
124
+ elif model_type == "SVC":
125
+ C = trial.suggest_loguniform("C", 1e-5, 1e2)
126
+ kernel = trial.suggest_categorical("kernel", ["linear", "rbf"])
127
+ model = SVC(C=C, kernel=kernel, random_state=42)
128
+
129
+ # Cross-validation score
130
+ score = cross_val_score(model, X_train, y_train, cv=5, scoring="accuracy").mean()
131
+ return score
132
+
133
+ # Run Optuna optimization
134
+ if st.button("Start Hyperparameter Tuning"):
135
+ study = optuna.create_study(direction="maximize")
136
+ study.optimize(objective, n_trials=100)
137
+ st.write(f"Best trial: {study.best_trial.params}")
138
+ st.write(f"Best score: {study.best_trial.value}")
139
+
140
+ # Select best model and evaluate
141
+ best_model_type = study.best_trial.params['model']
142
+ if best_model_type == "KNN":
143
+ model = KNeighborsClassifier(n_neighbors=study.best_trial.params['n_neighbors'], p=study.best_trial.params['p'])
144
+ elif best_model_type == "LogisticRegression":
145
+ model = LogisticRegression(C=study.best_trial.params['C'], solver=study.best_trial.params['solver_penalty'][0],
146
+ penalty=study.best_trial.params['solver_penalty'][1], multi_class="multinomial")
147
+ elif best_model_type == "RandomForest":
148
+ model = RandomForestClassifier(n_estimators=study.best_trial.params['n_estimators'],
149
+ max_depth=study.best_trial.params['max_depth'], random_state=42)
150
+ elif best_model_type == "SVC":
151
+ model = SVC(C=study.best_trial.params['C'], kernel=study.best_trial.params['kernel'], random_state=42)
152
+
153
+ # Model training
154
+ model.fit(X_train, y_train)
155
+ y_pred = model.predict(X_test)
156
+
157
+ # Evaluation
158
+ st.write("### Model Evaluation:")
159
+ if "Accuracy" in metrics:
160
+ accuracy = accuracy_score(y_test, y_pred)
161
+ st.write(f"Accuracy: {accuracy}")
162
+ if "Precision" in metrics:
163
+ precision = precision_score(y_test, y_pred, average='weighted')
164
+ st.write(f"Precision: {precision}")
165
+ if "Recall" in metrics:
166
+ recall = recall_score(y_test, y_pred, average='weighted')
167
+ st.write(f"Recall: {recall}")
168
+ if "F1-score" in metrics:
169
+ f1 = f1_score(y_test, y_pred, average='weighted')
170
+ st.write(f"F1-score: {f1}")
171
+
172
+ # Display classification report
173
+ st.write("### Classification Report:")
174
+ st.write(classification_report(y_test, y_pred))
175
  else:
176
+ st.warning("Please upload a dataset to proceed with EDA.")