# DataFlowPro / src / model_trainer.py
# Contains classes and functions for model
# building, hyperparameter tuning, and training models.
import numpy as np
from sklearn.model_selection import GridSearchCV
from joblib import dump # For saving models
from src.config import model_dict
import streamlit as st
class ModelTrainer:
    """Hyperparameter tuning and training driven by a JSON experiment config.

    Each ``tune_*`` method builds a ``GridSearchCV`` parameter grid from the
    user-supplied ``model_parameters`` dict and returns the best estimator
    found on the given training data.
    """

    def __init__(self, json_content: dict):
        """Store the experiment config and derive shared tuning settings.

        Args:
            json_content: Parsed JSON experiment description; must contain
                ``design_state_data.train.k_fold``.
        """
        self.json_content = json_content
        # A falsy k_fold (0, None, "") means "use GridSearchCV's default CV".
        self.k_fold = json_content["design_state_data"]["train"]["k_fold"]
        if not self.k_fold:
            self.k_fold = None
        # Default random_state grid, used when the config does not supply one.
        self.random_state = [42]
        # Number of points sampled along each numeric hyperparameter range.
        self.num_iter = 3

    @staticmethod
    def _as_grid(value):
        """Return *value* as a list so GridSearchCV accepts it as a grid.

        Config values may arrive as scalars (e.g. ``42``, ``"auto"``, a bare
        float) or as sequences; GridSearchCV requires every grid entry to be
        a sequence of candidate values. A bare string such as ``"auto"`` is
        an invalid grid (it would be iterated character by character).
        """
        if isinstance(value, (list, tuple, np.ndarray)):
            return list(value)
        return [value]

    def _grid_search(self, model, params, X_train, y_train):
        """Run GridSearchCV with the shared CV setting; return best estimator."""
        gcv = GridSearchCV(model, params, cv=self.k_fold)
        gcv.fit(X_train, y_train)
        return gcv.best_estimator_

    def tune_random_forest(self, model, X_train, y_train, model_name, model_parameters):
        """Grid-search n_estimators / max_depth / min_samples_leaf for a random forest."""
        cfg = model_parameters[model_name]
        params = {"random_state": self.random_state}
        params["n_estimators"] = np.linspace(
            cfg["min_trees"], cfg["max_trees"], self.num_iter, dtype=int)
        params["max_depth"] = np.linspace(
            cfg["min_depth"], cfg["max_depth"], self.num_iter, dtype=int)
        params["min_samples_leaf"] = np.linspace(
            cfg["min_samples_per_leaf_min_value"],
            cfg["min_samples_per_leaf_max_value"],
            self.num_iter, dtype=int)
        if cfg.get("random_state"):
            params["random_state"] = self._as_grid(cfg["random_state"])
        return self._grid_search(model, params, X_train, y_train)

    def tune_linear_elasticnet_regression(self, model, X_train, y_train, model_name, model_parameters):
        """Grid-search max_iter / alpha / l1_ratio for Linear or ElasticNet regression.

        NOTE(review): plain ``LinearRegression`` accepts none of these
        parameters (no random_state/max_iter/alpha/l1_ratio) — this grid only
        makes sense for ElasticNet; confirm how "LinearRegression" is mapped
        in ``model_dict``.
        """
        cfg = model_parameters[model_name]
        params = {"random_state": self.random_state}
        if cfg.get("random_state"):
            params["random_state"] = self._as_grid(cfg["random_state"])
        params["max_iter"] = np.linspace(
            cfg["min_iter"], cfg["max_iter"], self.num_iter, dtype=int)
        # NOTE(review): logspace treats min/max_regparam as base-10 exponents,
        # not as the regularization values themselves — confirm config units.
        params["alpha"] = np.logspace(cfg["min_regparam"], cfg["max_regparam"], self.num_iter)
        params["l1_ratio"] = np.linspace(
            cfg["min_elasticnet"], cfg["max_elasticnet"], self.num_iter)
        return self._grid_search(model, params, X_train, y_train)

    def tune_logistic_regression(self, model, X_train, y_train, model_parameters):
        """Grid-search max_iter / C / l1_ratio for logistic regression."""
        cfg = model_parameters["LogisticRegression"]
        params = {"random_state": self.random_state}
        if cfg.get("random_state"):
            params["random_state"] = self._as_grid(cfg["random_state"])
        params["max_iter"] = np.linspace(
            cfg["min_iter"], cfg["max_iter"], self.num_iter, dtype=int)
        # NOTE(review): logspace treats min/max_regparam as base-10 exponents.
        params["C"] = np.logspace(cfg["min_regparam"], cfg["max_regparam"], self.num_iter)
        params["l1_ratio"] = np.linspace(
            cfg["min_elasticnet"], cfg["max_elasticnet"], self.num_iter)
        return self._grid_search(model, params, X_train, y_train)

    def tune_ridge_lasso_regression(self, model, X_train, y_train, model_name, model_parameters):
        """Grid-search max_iter / alpha for Ridge or Lasso regression."""
        cfg = model_parameters[model_name]
        params = {"random_state": self.random_state}
        if cfg.get("random_state"):
            params["random_state"] = self._as_grid(cfg["random_state"])
        params["max_iter"] = np.linspace(
            cfg["min_iter"], cfg["max_iter"], self.num_iter, dtype=int)
        # NOTE(review): logspace treats min/max_regparam as base-10 exponents.
        params["alpha"] = np.logspace(cfg["min_regparam"], cfg["max_regparam"], self.num_iter)
        return self._grid_search(model, params, X_train, y_train)

    def tune_decision_tree(self, model, X_train, y_train, model_name, model_parameters):
        """Grid-search max_depth / criterion / splitter for a decision tree.

        NOTE(review): "gini"/"entropy" are classifier criteria only; for
        ``DecisionTreeRegressor`` these values are invalid — confirm the
        config never enables them for regression.
        """
        cfg = model_parameters[model_name]
        params = {"random_state": self.random_state}
        if cfg.get("random_state"):
            params["random_state"] = self._as_grid(cfg["random_state"])
        params["max_depth"] = np.linspace(
            cfg["min_depth"], cfg["max_depth"], self.num_iter, dtype=int)
        criterion = []
        if cfg["use_gini"]:
            criterion.append("gini")
        if cfg["use_entropy"]:
            criterion.append("entropy")
        params["criterion"] = criterion
        splitter = []
        if cfg["use_random"]:
            splitter.append("random")
        if cfg["use_best"]:
            splitter.append("best")
        params["splitter"] = splitter
        if cfg.get("min_samples_per_leaf"):
            params["min_samples_leaf"] = self._as_grid(cfg["min_samples_per_leaf"])
        return self._grid_search(model, params, X_train, y_train)

    def tune_svm(self, model, X_train, y_train, model_parameters):
        """Grid-search kernel / C / gamma / max_iter / tol for an SVM."""
        cfg = model_parameters["SVM"]
        kernels = []
        if cfg["linear_kernel"]:
            kernels.append("linear")
        if cfg["rep_kernel"]:
            kernels.append("rbf")
        if cfg["polynomial_kernel"]:
            kernels.append("poly")
        if cfg["sigmoid_kernel"]:
            kernels.append("sigmoid")
        gamma = []
        if cfg["scale"]:
            gamma.append("scale")
        if cfg["auto"]:
            gamma.append("auto")
        params = {
            "kernel": kernels,
            "C": self._as_grid(cfg["c_value"]),
            "gamma": gamma,
            "max_iter": self._as_grid(cfg["max_iterations"]),
            "tol": self._as_grid(cfg["tolerance"]),
        }
        return self._grid_search(model, params, X_train, y_train)

    def tune_knn(self, model, X_train, y_train, model_parameters):
        """Grid-search n_neighbors (plus weights/algorithm) for a KNN model."""
        cfg = model_parameters["KNN"]
        params = {"n_neighbors": self._as_grid(cfg["k_value"])}
        if cfg.get("distance_weighting"):
            params["weights"] = ["distance"]
        if cfg["neighbour_finding_algorithm"] == "Automatic":
            # Must be a list: a bare "auto" string is an invalid grid entry.
            params["algorithm"] = ["auto"]
        return self._grid_search(model, params, X_train, y_train)

    def tune_neural_network(self, model, X_train, y_train, model_parameters):
        """Grid-search an MLP using the config's ``neural_network`` section."""
        cfg = model_parameters["neural_network"]
        params = {
            "random_state": self.random_state,
            "hidden_layer_sizes": self._as_grid(cfg["hidden_layer_sizes"]),
            "alpha": self._as_grid(cfg["alpha_value"]),
            "max_iter": self._as_grid(cfg["max_iterations"]),
            "tol": self._as_grid(cfg["convergence_tolerance"]),
            "early_stopping": self._as_grid(cfg["early_stopping"]),
            "solver": self._as_grid(cfg["solver"]),
            "shuffle": self._as_grid(cfg["shuffle_data"]),
            "learning_rate_init": self._as_grid(cfg["initial_learning_rate"]),
            # NOTE(review): mapping "automatic_batching" onto batch_size looks
            # suspicious (batch_size expects "auto" or an int, not a flag) —
            # confirm against the config schema.
            "batch_size": self._as_grid(cfg["automatic_batching"]),
            "beta_1": self._as_grid(cfg["beta_1"]),
            "beta_2": self._as_grid(cfg["beta_2"]),
            "epsilon": self._as_grid(cfg["epsilon"]),
            "power_t": self._as_grid(cfg["power_t"]),
            "momentum": self._as_grid(cfg["momentum"]),
            "nesterovs_momentum": self._as_grid(cfg["use_nesterov_momentum"]),
        }
        if cfg.get("random_state"):
            params["random_state"] = self._as_grid(cfg["random_state"])
        if cfg.get("activation"):
            params["activation"] = self._as_grid(cfg["activation"])
        return self._grid_search(model, params, X_train, y_train)

    def tune_xgb(self, model, X_train, y_train, model_name, model_parameters):
        """Grid-search an XGBoost model.

        Uses the xgboost scikit-learn API parameter names (``reg_alpha``,
        ``reg_lambda``, ``subsample``, ``colsample_bytree``); the previous
        keys ("l1_regularization", "sub_sample", ...) are not estimator
        parameters and GridSearchCV would reject them.
        """
        cfg = model_parameters["xg_boost"]
        params = {"random_state": self.random_state, "booster": []}
        if cfg.get("random_state"):
            params["random_state"] = self._as_grid(cfg["random_state"])
        if cfg.get("use_gradient_boosted_tree"):
            params["booster"].append("gbtree")
        if cfg.get("dart"):
            params["booster"].append("dart")
        params["n_estimators"] = self._as_grid(cfg["max_num_of_trees"])
        params["tree_method"] = self._as_grid(cfg["tree_method"])
        if cfg["early_stopping"]:
            params["early_stopping_rounds"] = self._as_grid(cfg["early_stopping_rounds"])
        params["max_depth"] = self._as_grid(cfg["max_depth_of_tree"])
        params["learning_rate"] = self._as_grid(cfg["learningRate"])
        params["reg_alpha"] = self._as_grid(cfg["l1_regularization"])
        params["reg_lambda"] = self._as_grid(cfg["l2_regularization"])
        params["min_child_weight"] = self._as_grid(cfg["min_child_weight"])
        params["gamma"] = self._as_grid(cfg["gamma"])
        params["subsample"] = self._as_grid(cfg["sub_sample"])
        params["colsample_bytree"] = self._as_grid(cfg["col_sample_by_tree"])
        return self._grid_search(model, params, X_train, y_train)

    def build_and_tune_model(self, X_train, y_train, problem_type, selected_models, model_parameters):
        """Tune every selected model and return ``{model_name: best_estimator}``.

        Args:
            problem_type: "Regression" or "Classification"; selects which
                ``model_dict`` entry is used for SVM / KNN.
            selected_models: iterable of model-name keys to tune.
            model_parameters: per-model hyperparameter-range config.
        """
        self.best_models = {}
        for model_name in selected_models:
            if model_name == "xg_boost":
                # XGBoost is not wired up yet; skip instead of failing.
                st.warning("As of now xg_boost is not supported")
                continue
            if model_name == "SVM":
                key = "SVMRegressor" if problem_type == "Regression" else "SVMClassifier"
                best_model = self.tune_svm(model_dict[key], X_train, y_train, model_parameters)
            elif model_name == "KNN":
                key = "KNNRegressor" if problem_type == "Regression" else "KNNClassifier"
                best_model = self.tune_knn(model_dict[key], X_train, y_train, model_parameters)
            elif model_name == "neural_network":
                # Same model_dict entry serves both problem types (as before).
                best_model = self.tune_neural_network(
                    model_dict["neural_network"], X_train, y_train, model_parameters)
            elif model_name in ("RandomForestClassifier", "RandomForestRegressor"):
                best_model = self.tune_random_forest(
                    model_dict[model_name], X_train, y_train, model_name, model_parameters)
            elif model_name in ("LinearRegression", "ElasticNetRegression"):
                best_model = self.tune_linear_elasticnet_regression(
                    model_dict[model_name], X_train, y_train, model_name, model_parameters)
            elif model_name == "LogisticRegression":
                best_model = self.tune_logistic_regression(
                    model_dict[model_name], X_train, y_train, model_parameters)
            elif model_name in ("RidgeRegression", "LassoRegression"):
                best_model = self.tune_ridge_lasso_regression(
                    model_dict[model_name], X_train, y_train, model_name, model_parameters)
            elif model_name in ("DecisionTreeRegressor", "DecisionTreeClassifier"):
                best_model = self.tune_decision_tree(
                    model_dict[model_name], X_train, y_train, model_name, model_parameters)
            else:
                # Previously an unrecognized name raised UnboundLocalError;
                # warn and skip it instead.
                st.warning(f"Unsupported model: {model_name}")
                continue
            self.best_models[model_name] = best_model
        return self.best_models