# (stray build-log residue removed)
# Contains classes and functions for model
# building, hyperparameter tuning, and training models.
import numpy as np
from sklearn.model_selection import GridSearchCV
from joblib import dump  # For saving models
from src.config import model_dict
import streamlit as st
class ModelTrainer:
    """Builds and tunes the models described by the experiment's JSON
    configuration, using sklearn's GridSearchCV for hyperparameter search."""

    def __init__(self, json_content: dict):
        """Store the configuration and derive cross-validation settings.

        json_content: parsed experiment JSON. The CV fold count is read
        from design_state_data.train.k_fold; a falsy value maps to None,
        i.e. GridSearchCV's default CV strategy.
        """
        self.json_content = json_content
        folds = json_content["design_state_data"]["train"]["k_fold"]
        # Falsy (0 / None / "") -> None so GridSearchCV falls back to its default.
        self.k_fold = folds or None
        self.random_state = [42]  # default random_state grid for all searches
        self.num_iter = 3         # grid points sampled per numeric range
| def tune_random_forest(self, model, X_train, y_train, model_name, model_parameters): | |
| params = {"random_state": self.random_state} | |
| min_trees = model_parameters[model_name]["min_trees"] | |
| max_trees = model_parameters[model_name]["max_trees"] | |
| params["n_estimators"] = np.linspace(min_trees, max_trees, self.num_iter, dtype=int) | |
| min_depth = model_parameters[model_name]["min_depth"] | |
| max_depth = model_parameters[model_name]["max_depth"] | |
| params["max_depth"] = np.linspace(min_depth, max_depth, self.num_iter, dtype=int) | |
| min_samples_per_leaf = model_parameters[model_name]["min_samples_per_leaf_min_value"] | |
| max_samples_per_leaf = model_parameters[model_name]["min_samples_per_leaf_max_value"] | |
| params["min_samples_leaf"] = np.linspace(min_samples_per_leaf, max_samples_per_leaf, self.num_iter, dtype=int) | |
| if model_parameters[model_name].get("random_state"): | |
| params["random_state"] = model_parameters[model_name]["random_state"] | |
| gcv = GridSearchCV(model, params, cv=self.k_fold) | |
| gcv.fit(X_train, y_train) | |
| return gcv.best_estimator_ | |
| def tune_linear_elasticnet_regression(self, model, X_train, y_train, model_name, model_parameters): | |
| params = {"random_state": self.random_state} | |
| if model_parameters[model_name].get("random_state"): | |
| params["random_state"] = model_parameters[model_name]["random_state"] | |
| min_iter = model_parameters[model_name]["min_iter"] | |
| max_iter = model_parameters[model_name]["max_iter"] | |
| params["max_iter"] = np.linspace(min_iter, max_iter, self.num_iter, dtype=int) | |
| min_reg = model_parameters[model_name]["min_regparam"] | |
| max_reg = model_parameters[model_name]["max_regparam"] | |
| params["alpha"] = np.logspace(min_reg, max_reg, self.num_iter) | |
| min_elasticnet = model_parameters[model_name]["min_elasticnet"] | |
| max_elasticnet = model_parameters[model_name]["max_elasticnet"] | |
| params["l1_ratio"] = np.linspace(min_elasticnet, max_elasticnet, self.num_iter) | |
| gcv = GridSearchCV(model, params, cv=self.k_fold) | |
| gcv.fit(X_train, y_train) | |
| return gcv.best_estimator_ | |
| def tune_logistic_regression(self, model, X_train, y_train, model_parameters): | |
| params = {"random_state": self.random_state} | |
| if model_parameters["LogisticRegression"].get("random_state"): | |
| params["random_state"] = model_parameters["LogisticRegression"]["random_state"] | |
| min_iter = model_parameters["LogisticRegression"]["min_iter"] | |
| max_iter = model_parameters["LogisticRegression"]["max_iter"] | |
| params["max_iter"] = np.linspace(min_iter, max_iter, self.num_iter, dtype=int) | |
| min_reg = model_parameters["LogisticRegression"]["min_regparam"] | |
| max_reg = model_parameters["LogisticRegression"]["max_regparam"] | |
| params["C"] = np.logspace(min_reg, max_reg, self.num_iter) | |
| min_elasticnet = model_parameters["LogisticRegression"]["min_elasticnet"] | |
| max_elasticnet = model_parameters["LogisticRegression"]["max_elasticnet"] | |
| params["l1_ratio"] = np.linspace(min_elasticnet, max_elasticnet, self.num_iter) | |
| gcv = GridSearchCV(model, params, cv=self.k_fold) | |
| gcv.fit(X_train, y_train) | |
| return gcv.best_estimator_ | |
| def tune_ridge_lasso_regression(self, model, X_train, y_train, model_name, model_parameters): | |
| params = {"random_state": self.random_state} | |
| if model_parameters[model_name].get("random_state"): | |
| params["random_state"] = model_parameters[model_name]["random_state"] | |
| min_iter = model_parameters[model_name]["min_iter"] | |
| max_iter = model_parameters[model_name]["max_iter"] | |
| params["max_iter"] = np.linspace(min_iter, max_iter, self.num_iter, dtype=int) | |
| min_regparam = model_parameters[model_name]["min_regparam"] | |
| max_regparam = model_parameters[model_name]["max_regparam"] | |
| params["alpha"] = np.logspace(min_regparam, max_regparam, self.num_iter) | |
| gcv = GridSearchCV(model, params, cv=self.k_fold) | |
| gcv.fit(X_train, y_train) | |
| return gcv.best_estimator_ | |
| def tune_decision_tree(self, model, X_train, y_train, model_name, model_parameters): | |
| params = {"random_state": self.random_state} | |
| if model_parameters[model_name].get("random_state"): | |
| params["random_state"] = model_parameters[model_name]["random_state"] | |
| min_depth = model_parameters[model_name]["min_depth"] | |
| max_depth = model_parameters[model_name]["max_depth"] | |
| params["max_depth"] = np.linspace(min_depth, max_depth, self.num_iter, dtype=int) | |
| criterion = [] | |
| if model_parameters[model_name]["use_gini"]: | |
| criterion.append("gini") | |
| if model_parameters[model_name]["use_entropy"]: | |
| criterion.append("entropy") | |
| params["criterion"] = criterion | |
| splitter = [] | |
| if model_parameters[model_name]["use_random"]: | |
| splitter.append("random") | |
| if model_parameters[model_name]["use_best"]: | |
| splitter.append("best") | |
| params["splitter"] = splitter | |
| if model_parameters[model_name].get("min_samples_per_leaf"): | |
| params["min_samples_leaf"] = model_parameters[model_name]["min_samples_per_leaf"] | |
| gcv = GridSearchCV(model, params, cv=self.k_fold) | |
| gcv.fit(X_train, y_train) | |
| return gcv.best_estimator_ | |
| def tune_svm(self, model, X_train, y_train, model_parameters): | |
| params = {} | |
| kernel = [] | |
| if model_parameters["SVM"]["linear_kernel"]: | |
| kernel.append("linear") | |
| if model_parameters["SVM"]["rep_kernel"]: | |
| kernel.append("rbf") | |
| if model_parameters["SVM"]["polynomial_kernel"]: | |
| kernel.append("poly") | |
| if model_parameters["SVM"]["sigmoid_kernel"]: | |
| kernel.append("sigmoid") | |
| params["kernel"] = kernel | |
| params["C"] = model_parameters["SVM"]["c_value"] | |
| gamma = [] | |
| if model_parameters["SVM"]["scale"]: | |
| gamma.append("scale") | |
| if model_parameters["SVM"]["auto"]: | |
| gamma.append("auto") | |
| params["gamma"] = gamma | |
| params["max_iter"] = model_parameters["SVM"]["max_iterations"] | |
| params["tol"] = model_parameters["SVM"]["tolerance"] | |
| gcv = GridSearchCV(model, params, cv=self.k_fold) | |
| gcv.fit(X_train, y_train) | |
| return gcv.best_estimator_ | |
| def tune_knn(self, model, X_train, y_train, model_parameters): | |
| params = {} | |
| params["n_neighbors"] = model_parameters["KNN"]["k_value"] | |
| if model_parameters["KNN"].get("distance_weighting"): | |
| params["weights"] = ["distance"] | |
| if model_parameters["KNN"]["neighbour_finding_algorithm"] == "Automatic": | |
| params["algorithm"] = "auto" | |
| gcv = GridSearchCV(model, params, cv=self.k_fold) | |
| gcv.fit(X_train, y_train) | |
| return gcv.best_estimator_ | |
| pass | |
| def tune_neural_network(self, model, X_train, y_train, model_parameters): | |
| parameters = model_parameters["neural_network"] | |
| params = {"random_state": self.random_state, | |
| "hidden_layer_sizes": parameters["hidden_layer_sizes"], | |
| "alpha": parameters["alpha_value"], | |
| "max_iter": parameters["max_iterations"], | |
| "tol": parameters["convergence_tolerance"], | |
| "early_stopping": parameters["early_stopping"], | |
| "solver": parameters["solver"], | |
| "shuffle": parameters["shuffle_data"], | |
| "learning_rate_init": parameters["initial_learning_rate"], | |
| "batch_size": parameters["automatic_batching"], | |
| "beta_1": parameters["beta_1"], | |
| "beta_2": parameters["beta_2"], | |
| "epsilon": parameters["epsilon"], | |
| "power_t": parameters["power_t"], | |
| "momentum": parameters["momentum"], | |
| "nesterovs_momentum": parameters["use_nesterov_momentum"], | |
| } | |
| if parameters.get("random_state"): | |
| params["random_state"] = parameters["random_state"] | |
| if parameters.get("activation"): | |
| params["activation"] = parameters["activation"] | |
| gcv = GridSearchCV(model, params, cv=self.k_fold) | |
| gcv.fit(X_train, y_train) | |
| return gcv.best_estimator_ | |
| def tune_xgb(self, model, X_train, y_train, model_name, model_parameters): | |
| params = {"random_state": self.random_state, | |
| "booster": [] | |
| } | |
| if model_parameters["xg_boost"].get("random_state"): | |
| params["random_state"] = model_parameters["xg_boost"]["random_state"] | |
| if model_parameters["xg_boost"].get("use_gradient_boosted_tree"): | |
| params["booster"].append("gbtree") | |
| if model_parameters["xg_boost"].get("dart"): | |
| params["booster"].append("dart") | |
| params["n_estimators"] = model_parameters["xg_boost"]["max_num_of_trees"] | |
| params["tree_method"] = model_parameters["xg_boost"]["tree_method"] | |
| if model_parameters["xg_boost"]["early_stopping"]: | |
| params["early_stopping_rounds"] = model_parameters["xg_boost"]["early_stopping_rounds"] | |
| params["max_depth"] = model_parameters["xg_boost"]["max_depth_of_tree"] | |
| params["learning_rate"] = model_parameters["xg_boost"]["learningRate"] | |
| params["l1_regularization"] = model_parameters["xg_boost"]["l1_regularization"] | |
| params["l2_regularization"] = model_parameters["xg_boost"]["l2_regularization"] | |
| params["min_child_weight"] = model_parameters["xg_boost"]["min_child_weight"] | |
| params["gamma"] = model_parameters["xg_boost"]["gamma"] | |
| params["sub_sample"] = model_parameters["xg_boost"]["sub_sample"] | |
| params["col_sample_by_tree"] = model_parameters["xg_boost"]["col_sample_by_tree"] | |
| gcv = GridSearchCV(model, params, cv=self.k_fold) | |
| gcv.fit(X_train, y_train) | |
| return gcv.best_estimator_ | |
| def build_and_tune_model(self, X_train, y_train, problem_type, selected_models, model_parameters): | |
| self.best_models = {} | |
| for model_name in selected_models: | |
| if model_name == "xg_boost": | |
| st.warning("As of now xg_boost is not supported") | |
| continue | |
| if model_name == "SVM" and problem_type == "Regression": | |
| model = model_dict["SVMRegressor"] | |
| best_model = self.tune_svm(model, X_train, y_train, model_parameters) | |
| elif model_name == "SVM" and problem_type == "Classification": | |
| model = model_dict["SVMClassifier"] | |
| best_model = self.tune_svm(model, X_train, y_train, model_parameters) | |
| elif model_name == "KNN" and problem_type == "Regression": | |
| model = model_dict["KNNRegressor"] | |
| best_model = self.tune_knn(model, X_train, y_train, model_parameters) | |
| elif model_name == "KNN" and problem_type == "Classification": | |
| model = model_dict["KNNClassifier"] | |
| best_model = self.tune_knn(model, X_train, y_train, model_parameters) | |
| elif model_name == "neural_network" and problem_type == "Regression": | |
| model = model_dict["neural_network"] | |
| best_model = self.tune_neural_network(model, X_train, y_train, model_parameters) | |
| elif model_name == "neural_network" and problem_type == "Classification": | |
| model = model_dict["neural_network"] | |
| best_model = self.tune_neural_network(model, X_train, y_train, model_parameters) | |
| elif model_name == "xg_boost" and problem_type == "Regression": | |
| model = model_dict["XGBoostRegressor"] | |
| best_model = self.tune_xgb(model, X_train, y_train, model_name, model_parameters) | |
| elif model_name == "xg_boost" and problem_type == "Classification": | |
| model = model_dict["XGBoostClassifier"] | |
| best_model = self.tune_xgb(model, X_train, y_train, model_name, model_parameters) | |
| else: | |
| model = model_dict[model_name] | |
| if (model_name == "RandomForestClassifier" or model_name == "RandomForestRegressor"): | |
| best_model = self.tune_random_forest(model, X_train, y_train, model_name, model_parameters) | |
| elif (model_name == "LinearRegression" or model_name == "ElasticNetRegression"): | |
| best_model = self.tune_linear_elasticnet_regression(model, X_train, y_train, model_name, model_parameters) | |
| elif model_name == "LogisticRegression": | |
| best_model = self.tune_logistic_regression(model, X_train, y_train, model_parameters) | |
| elif (model_name == "RidgeRegression" or model_name == "LassoRegression"): | |
| best_model = self.tune_ridge_lasso_regression(model, X_train, y_train, model_name, model_parameters) | |
| elif (model_name == "DecisionTreeRegressor" or model_name == "DecisionTreeClassifier"): | |
| best_model = self.tune_decision_tree(model, X_train, y_train, model_name, model_parameters) | |
| self.best_models[model_name] = best_model | |
| return self.best_models | |