Spaces:
Build error
Build error
| import time | |
| from typing import Text | |
| import lightgbm as lgb | |
| import matplotlib.pyplot as plt | |
| import numpy as np | |
| import xgboost as xgb | |
| from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.metrics import ( | |
| accuracy_score, | |
| roc_auc_score, | |
| cohen_kappa_score, | |
| plot_confusion_matrix, | |
| roc_curve, | |
| classification_report, | |
| ) | |
| from sklearn.model_selection import GridSearchCV | |
| from sklearn.neural_network import MLPClassifier | |
| from sklearn.tree import DecisionTreeClassifier | |
| from configs.constants import SUPPORT_MODEL, DEFAULT_MODEL | |
| from ml.data_prepare import data_preparing, create_dataset | |
def plot_roc_cur(fper, tper):
    """
    Draw a ROC curve together with the chance (diagonal) reference line.

    :param fper: false positive rates, one per threshold (x-axis)
    :param tper: true positive rates, one per threshold (y-axis)
    """
    # Curve first, then the no-skill diagonal as a dashed reference line.
    plt.plot(fper, tper, color="orange", label="ROC")
    plt.plot([0, 1], [0, 1], color="darkblue", linestyle="--")
    plt.title("Receiver Operating Characteristic (ROC) Curve")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.legend()
    plt.show()
class MLModel:
    """
    WC predictor model.

    Wraps a binary classifier of the requested type in a GridSearchCV so
    that ``fit_and_eval_model`` tunes hyper-parameters and reports
    accuracy / ROC-AUC / Cohen's kappa on a held-out test set.
    """

    def __init__(self, model_type: Text):
        """
        :param model_type: one of SUPPORT_MODEL, e.g. "LogisticRegression"
        :raises ValueError: if ``model_type`` is not in SUPPORT_MODEL
        """
        # Explicit raise instead of assert: assert is stripped under
        # ``python -O`` and would silently skip this validation.
        if model_type not in SUPPORT_MODEL:
            raise ValueError(
                "Not support the kind of model. Please choose one of {}".format(
                    SUPPORT_MODEL
                )
            )
        self.model_type = model_type
        # Dispatch table instead of an if/elif chain. ``.get`` preserves
        # the original behavior of leaving ``self.model`` unset for a
        # supported type that has no matching factory.
        factories = {
            "LogisticRegression": self.get_logistic_regression_model,
            "DecisionTreeClassifier": self.get_decision_tree_model,
            "MLPClassifier": self.get_neural_network_model,
            "RandomForestClassifier": self.get_random_forest_model,
            "GradientBoostingClassifier": self.get_gradient_boosting_model,
            "LGBMClassifier": self.get_light_gbm_model,
            "XGBClassifier": self.get_xgboost_model,
        }
        factory = factories.get(model_type)
        if factory is not None:
            self.model = factory()

    def predict_proba(self, x):
        """
        Call predict_proba on the estimator with the best found parameters.

        :param x: feature matrix to score
        :return: per-class probability array from the wrapped model
        """
        return self.model.predict_proba(x)

    def __run_model(self, model, x_train, y_train, x_test, y_test, verbose=True):
        """
        Fit a grid-searched model and print evaluation metrics.

        Bug fix: the original definition was missing ``self``, so the
        bound call ``self.__run_model(self.model, ...)`` shifted every
        argument by one position.

        :param model: GridSearchCV wrapper around an estimator
        :param x_train: training features (DataFrame; ``.values`` is used)
        :param y_train: training labels
        :param x_test: test features (DataFrame; ``.values`` is used)
        :param y_test: test labels (Series; ``.values`` is used)
        :param verbose: when False, forwards ``verbose=0`` to ``fit``
        :return: (best_estimator, accuracy, roc_auc, cohen_kappa, seconds)
        """
        t0 = time.time()
        if verbose is False:
            # NOTE(review): not every estimator's fit() accepts a
            # ``verbose`` kwarg — confirm before calling with verbose=False.
            model.fit(x_train.values, np.ravel(y_train), verbose=0)
        else:
            model.fit(x_train.values, np.ravel(y_train))
        # Evaluate the refit best estimator, not the search object.
        model = model.best_estimator_
        y_pred = model.predict(x_test)
        accuracy = accuracy_score(y_test.values, y_pred)
        # ROC-AUC needs the positive-class probability, column 1.
        roc_auc = roc_auc_score(y_test, model.predict_proba(x_test.values)[:, 1])
        coh_kap = cohen_kappa_score(y_test, y_pred)
        time_taken = time.time() - t0
        print("Accuracy : {}".format(accuracy))
        print("ROC Area under Curve : {}".format(roc_auc))
        print("Cohen's Kappa : {}".format(coh_kap))
        print("Time taken : {}".format(time_taken))
        print(classification_report(y_test, y_pred, digits=5))
        return model, accuracy, roc_auc, coh_kap, time_taken

    def fit_and_eval_model(self, x_train, x_test, y_train, y_test):
        """
        Run the model with dataset.

        :param x_train: training features
        :param x_test: test features
        :param y_train: training labels
        :param y_test: test labels
        :return: (best_estimator, accuracy, roc_auc, cohen_kappa, seconds)
        """
        return self.__run_model(self.model, x_train, y_train, x_test, y_test)

    def get_logistic_regression_model(self, **params_lr):
        """
        Return a GridSearchCV-wrapped logistic regression model.

        :param params_lr: optional hyper-parameter grid; the default grid
            below is used when no parameters are supplied
        :return: unfitted GridSearchCV over LogisticRegression
        """
        # ``if not params_lr`` (not ``all(values())``): an empty kwargs
        # dict must fall back to the default grid, and falsy values such
        # as 0 must not wipe out a caller-supplied grid.
        if not params_lr:
            params_lr = {
                "C": np.logspace(-3, 3, 7),
                "penalty": ["l1", "l2"],
                # Must be a list: a bare string would be iterated
                # character by character by GridSearchCV.
                "solver": ["liblinear"],
            }
        return GridSearchCV(
            LogisticRegression(),
            params_lr,
            cv=3,
            verbose=False,
            scoring="roc_auc",
            refit=True,
        )

    def get_decision_tree_model(self, **params):
        """
        Return a GridSearchCV-wrapped decision tree model.

        :param params: optional hyper-parameter grid; defaults used when empty
        :return: unfitted GridSearchCV over DecisionTreeClassifier
        """
        if not params:
            params = {
                # "auto" was removed from DecisionTreeClassifier in
                # scikit-learn 1.3 (it was an alias for "sqrt").
                "max_features": ["sqrt", "log2"],
                "ccp_alpha": [0.1, 0.01, 0.001],
                "max_depth": [5, 6, 7, 8, 9],
                "criterion": ["gini", "entropy"],
            }
        return GridSearchCV(
            estimator=DecisionTreeClassifier(),
            param_grid=params,
            cv=3,
            verbose=False,
            scoring="roc_auc",
            refit=True,
        )

    def get_neural_network_model(self, **params_nn):
        """
        Return a GridSearchCV-wrapped neural network (MLP) model.

        :param params_nn: optional hyper-parameter grid; defaults used when empty
        :return: unfitted GridSearchCV over MLPClassifier
        """
        if not params_nn:
            params_nn = {
                "solver": ["lbfgs"],
                "max_iter": list(range(1000, 2001, 100)),
                "alpha": 10.0 ** -np.arange(1, 10),
                "hidden_layer_sizes": np.arange(10, 15),
                "random_state": list(range(10)),
            }
        return GridSearchCV(
            MLPClassifier(),
            params_nn,
            n_jobs=-1,
            scoring="roc_auc",
            refit=True,
            verbose=False,
        )

    def get_random_forest_model(self, **params_rf):
        """
        Return a GridSearchCV-wrapped random forest model.

        :param params_rf: optional hyper-parameter grid; defaults used when empty
        :return: unfitted GridSearchCV over RandomForestClassifier
        """
        if not params_rf:
            params_rf = {
                "max_depth": [20],
                "min_samples_split": [10],
                "max_leaf_nodes": [175],
                "min_samples_leaf": [5],
                "n_estimators": [250],
                "max_features": ["sqrt"],
            }
        return GridSearchCV(
            RandomForestClassifier(),
            params_rf,
            cv=3,
            n_jobs=-1,
            verbose=False,
            scoring="roc_auc",
            refit=True,
        )

    def get_gradient_boosting_model(self, **params):
        """
        Return a GridSearchCV-wrapped gradient boosting model.

        :param params: optional hyper-parameter grid; defaults used when empty
        :return: unfitted GridSearchCV over GradientBoostingClassifier
        """
        if not params:
            params = {
                "learning_rate": [0.01, 0.02, 0.03],
                "min_samples_split": [5, 10],
                "min_samples_leaf": [3, 5],
                "max_depth": [3, 5, 10],
                "max_features": ["sqrt"],
                "n_estimators": [100, 200],
            }
        # scoring="roc_auc" for consistency with every other factory
        # (the metric __run_model selects the best estimator by).
        return GridSearchCV(
            GradientBoostingClassifier(random_state=100),
            params,
            cv=3,
            n_jobs=-1,
            scoring="roc_auc",
            refit=True,
        )

    def get_light_gbm_model(self, **params_lgb):
        """
        Return a GridSearchCV-wrapped LightGBM model.

        :param params_lgb: optional hyper-parameter grid; defaults used when empty
        :return: unfitted GridSearchCV over lgb.LGBMClassifier
        """
        if not params_lgb:
            params_lgb = {
                "learning_rate": [0.005, 0.01],
                "n_estimators": [8, 16, 24],
                # large num_leaves helps improve accuracy but might lead
                # to over-fitting
                "num_leaves": [6, 8, 12, 16],
                "boosting_type": ["gbdt", "dart"],  # for better accuracy -> try dart
                "objective": ["binary"],
                # large max_bin helps improve accuracy but might slow
                # down training progress
                "max_bin": [255, 510],
                "random_state": [500],
                "colsample_bytree": [0.64, 0.65, 0.66],
                "subsample": [0.7, 0.75],
                "reg_alpha": [1, 1.2],
                "reg_lambda": [1, 1.2, 1.4],
            }
        return GridSearchCV(
            lgb.LGBMClassifier(),
            params_lgb,
            verbose=False,
            cv=3,
            n_jobs=-1,
            scoring="roc_auc",
            refit=True,
        )

    def get_xgboost_model(self, **params_xgb):
        """
        Return a GridSearchCV-wrapped XGBoost model.

        :param params_xgb: optional hyper-parameter grid; defaults used when empty
        :return: unfitted GridSearchCV over xgb.XGBClassifier
        """
        if not params_xgb:
            params_xgb = {
                "nthread": [4],  # when use hyper thread, xgboost may become slower
                "objective": ["binary:logistic"],
                "learning_rate": [0.05],  # so called `eta` value
                "max_depth": [6],
                "min_child_weight": [11],
                # NOTE(review): "silent" was removed in xgboost >= 1.0
                # (replaced by "verbosity") — confirm installed version.
                "silent": [1],
                "subsample": [0.8],
                "colsample_bytree": [0.7],
                # number of trees, change it to 1000 for better results
                "n_estimators": [100],
                "missing": [-999],
                "seed": [1337],
            }
        return GridSearchCV(
            xgb.XGBClassifier(),
            params_xgb,
            n_jobs=-1,
            cv=3,
            scoring="roc_auc",
            refit=True,
        )
if __name__ == "__main__":
    # Run the train/tune/evaluate pipeline only when executed as a
    # script, so importing this module has no side effects.
    base_df, data_df = data_preparing()
    x_train, x_test, y_train, y_test = create_dataset(data_df)
    ml_model = MLModel(DEFAULT_MODEL)
    ml_model.fit_and_eval_model(x_train, x_test, y_train, y_test)