# -*- coding: utf-8 -*-
"""
Created on Mon Jul 12 10:00:30 2021
@author: Kishore
"""
################## Importing Modules ###########################################
import math

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import eli5
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (confusion_matrix, classification_report,
                             mean_squared_error, r2_score,
                             roc_curve, auc, roc_auc_score)
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from xgboost import XGBClassifier
#####################################################################
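# Third-party dependencies assumed by the imports above: scikit-learn, xgboost,
# imbalanced-learn, plotly, eli5 and pandas (pd.read_html, used below, also
# needs an HTML parser such as lxml installed).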
############# Identifying the problem type (Classification/Regression) in Predictive Analytics ##########
def get_problem_type1(clean_data, dependent_variable):
    limit_number_of_class = 10
    print("problem analysis")
    # An integer target with at most `limit_number_of_class` distinct values is
    # treated as a class label; anything else is treated as a continuous target.
    if ((clean_data.dtypes[dependent_variable] == 'int32'
            or clean_data.dtypes[dependent_variable] == 'int64')
            and clean_data[dependent_variable].nunique() <= limit_number_of_class):
        return "classification"
    else:
        return "regression"
#########################################################################################################
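# Example usage (hypothetical DataFrame, for illustration only): a
# low-cardinality integer target is routed to classification, a float target
# to regression.
#
#   df = pd.DataFrame({"price": [10.5, 22.0, 13.7], "churn": [0, 1, 0]})
#   get_problem_type1(df, "churn")   # -> "classification" (int64, 2 classes)
#   get_problem_type1(df, "price")   # -> "regression"     (float64 target)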
######################### Model Building for Predictive Analytics ############################
def model_build(clean_data, dependent_variable, problem_type, balance_data, steps_dict):
    print("Model build started")
    d = {}
    lst = []
    ######## Split features and target ##########
    train_data = clean_data.drop(dependent_variable, axis=1)
    target_data = clean_data[dependent_variable]
    if problem_type == "classification":
        data_dict = {}
        ###### Models ####################
        if balance_data == "Auto":
            # Oversample minority classes with SMOTE and record the class
            # distribution before and after resampling.
            d = {}
            d["Before Handling Imbalanced Dataset"] = target_data.value_counts()
            oversample = SMOTE()
            train_data, target_data = oversample.fit_resample(train_data, target_data)
            d["After Handling Imbalanced Dataset"] = target_data.value_counts()
            data_dict["Handling Imbalanced Dataset"] = d
        elif balance_data == "False":
            data_dict["Imbalanced dataset not handled (balance_data is set to False)"] = ""
        X_train, X_test, y_train, y_test = train_test_split(train_data, target_data,
                                                            test_size=0.3, random_state=0)
        # pipeline_lr = Pipeline([('scaler', StandardScaler()),
        #                         ('lr_classifier', LogisticRegression(random_state=0))])
        pipeline_dt = Pipeline([('scaler', StandardScaler()),
                                ('dt_classifier', DecisionTreeClassifier())])
        pipeline_randomforest = Pipeline([('scaler', StandardScaler()),
                                          ('rf_classifier', RandomForestClassifier())])
        pipeline_xgboost = Pipeline([('scaler', StandardScaler()),
                                     ('xg_classifier', XGBClassifier())])
        ############## List of candidate pipelines #####################
        pipelines = [pipeline_dt, pipeline_randomforest, pipeline_xgboost]
        best_accuracy = 0.0
        best_classifier = 0
        best_pipeline = ""
        ################## Dictionary of pipelines and classifier types for ease of reference ############
        pipe_dict = {0: 'Decision_Tree', 1: 'RandomForest', 2: 'XGBoost_Classifier'}
        ########## Fit the pipelines ##################
        for pipe in pipelines:
            pipe.fit(X_train, y_train)
        models_info = {}
        for i, model in enumerate(pipelines):
            accuracy = model.score(X_test, y_test)
            val = "{} Test Accuracy: {}".format(pipe_dict[i], accuracy)
            lst.append(val)
            models_info[pipe_dict[i]] = accuracy
            print(val)
        df_models_info = pd.DataFrame(models_info.items(), columns=["Models", "Accuracy"])
        # Keep the pipeline with the highest held-out accuracy.
        for i, model in enumerate(pipelines):
            if model.score(X_test, y_test) > best_accuracy:
                best_accuracy = model.score(X_test, y_test)
                best_pipeline = model
                best_classifier = i
        html_object = eli5.show_weights(best_pipeline, feature_names=X_train.columns.tolist())
        result = pd.read_html(html_object.data)[0]
        data_dict['Model Interpretation'] = result
        val1 = 'Classifier with best accuracy:{}'.format(pipe_dict[best_classifier])
        lst.append(val1)
        print(val1)
        y_pred = best_pipeline.predict(X_test)
        cn = confusion_matrix(y_test, y_pred)
        data_dict['Model details'] = lst
        fig = px.histogram(df_models_info, x="Models", y="Accuracy", color="Models")
        fig.update_layout(yaxis_title="Accuracy")
        data_dict['model_comparison'] = fig
        data_dict['Best model'] = lst[-1].split(':')[1]
        data_dict['Best pipeline'] = best_pipeline
        data_dict['Confusion Matrix'] = cn
        # Cross-validate only on smaller datasets to keep the run time bounded.
        if len(X_train) <= 100000:
            cv = cross_val_score(best_pipeline, X_train, y_train, cv=5, scoring='accuracy')
            data_dict['Cross Validation'] = cv
        report = classification_report(y_test, y_pred)
        data_dict['Classification Report'] = report
        y_scores = best_pipeline.predict_proba(X_test)
        # One-hot encode the labels so each class gets its own ROC curve
        y_onehot = pd.get_dummies(y_test)
        # Start from an empty figure, add the diagonal chance line, then add
        # one ROC trace per class
        fig = go.Figure()
        fig.add_shape(
            type='line', line=dict(dash='dash'),
            x0=0, x1=1, y0=0, y1=1
        )
        for i in range(y_scores.shape[1]):
            y_true = y_onehot.iloc[:, i]
            y_score = y_scores[:, i]
            fpr, tpr, _ = roc_curve(y_true, y_score)
            auc_score = roc_auc_score(y_true, y_score)
            # Map the encoded class value back to its original label using the
            # encoding recorded in steps_dict; fall back to the encoded value
            # if the target column was never label-encoded.
            class_name = y_onehot.columns[i]
            for data1 in steps_dict['categorical_to_numeric']:
                for key, value in data1.items():
                    col_name = key.split('_encoded')[0]
                    if col_name == dependent_variable:
                        for j, v in value.items():
                            if v == y_onehot.columns[i]:
                                class_name = j
                                break
            name = f"{class_name} (AUC={auc_score:.2f})"
            fig.add_trace(go.Scatter(x=fpr, y=tpr, name=name, mode='lines'))
        fig.update_layout(
            xaxis_title='False Positive Rate',
            yaxis_title='True Positive Rate',
            yaxis=dict(scaleanchor="x", scaleratio=1),
            xaxis=dict(constrain='domain'),
            width=700, height=500
        )
        data_dict['ROC Curve'] = fig
        print("model completed")
        return data_dict
    elif problem_type == "regression":
        data_dict = {}
        X_train, X_test, y_train, y_test = train_test_split(train_data, target_data,
                                                            test_size=0.3, random_state=0)
        pipeline_linear = Pipeline([('scaler', StandardScaler()), ('linear_regressor', LinearRegression())])
        # pipeline_lr = Pipeline([('scaler', StandardScaler()), ('lr_regressor', LogisticRegression())])
        pipeline_dt = Pipeline([('scaler', StandardScaler()), ('dt_regressor', DecisionTreeRegressor())])
        pipeline_randomforest = Pipeline([('scaler', StandardScaler()), ('rf_regressor', RandomForestRegressor())])
        pipeline_svm = Pipeline([('scaler', StandardScaler()), ('svr', SVR(kernel='linear'))])
        pipeline_regression = [pipeline_linear, pipeline_dt, pipeline_randomforest, pipeline_svm]
        best_accuracy = 0.0
        best_regressor = 0
        best_pipeline = ""
        ################## Dictionary of pipelines and regressor types for ease of reference ############
        pipe_dict = {0: 'Linear_Regression', 1: 'Decision_Tree', 2: 'RandomForest', 3: 'SVM'}
        for pipe in pipeline_regression:
            pipe.fit(X_train, y_train)
        models_info = {}
        # For regressors, score() reports the R^2 value on the test set.
        for i, model in enumerate(pipeline_regression):
            score = model.score(X_test, y_test)
            val = "{} Test Accuracy: {}".format(pipe_dict[i], score)
            lst.append(val)
            models_info[pipe_dict[i]] = score
            print(val)
        df_models_info = pd.DataFrame(models_info.items(), columns=["Models", "Accuracy"])
        # Keep the pipeline with the highest held-out score.
        for i, model in enumerate(pipeline_regression):
            if model.score(X_test, y_test) > best_accuracy:
                best_accuracy = model.score(X_test, y_test)
                best_pipeline = model
                best_regressor = i
        html_object = eli5.show_weights(best_pipeline, feature_names=X_train.columns.tolist())
        result = pd.read_html(html_object.data)[0]
        data_dict['Model Interpretation'] = result
        val1 = 'Regressor with best accuracy:{}'.format(pipe_dict[best_regressor])
        lst.append(val1)
        print(val1)
        data_dict['Model details'] = lst
        fig = px.histogram(df_models_info, x="Models", y="Accuracy", color="Models")
        fig.update_layout(yaxis_title="Accuracy")
        data_dict['model_comparison'] = fig
        data_dict['Best model'] = lst[-1].split(':')[1]
        data_dict['Best pipeline'] = best_pipeline
        y_pred = best_pipeline.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        rmse = math.sqrt(mse)
        r2 = r2_score(y_test, y_pred)
        data_dict['MEAN SQUARED ERROR'] = "MEAN SQUARED ERROR : " + str(mse)
        data_dict['ROOT MEAN SQUARED ERROR'] = "ROOT MEAN SQUARED ERROR : " + str(rmse)
        data_dict['R2 Score'] = "R2 Score : " + str(r2)
        cv = cross_val_score(best_pipeline, X_train, y_train, cv=5)
        data_dict['Cross Validation'] = cv
        # Scatter the actual vs. predicted target values against sample index.
        fig = go.Figure([
            go.Scatter(y=y_test, name='Actual', mode='markers'),
            go.Scatter(y=y_pred, name='Predicted', mode='markers')
        ])
        fig.update_layout(
            title=str(lst[-1].split(':')[1]),
            xaxis_title="Sample index",
            yaxis_title="Target values")
        data_dict['Regression graph'] = fig
        return data_dict
    else:
        # Unknown problem type: return the default (empty) dictionary.
        return d
###############################################################################################
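# Usage sketch: a minimal, hypothetical smoke test of the two functions above,
# assuming the listed dependencies are installed. The synthetic DataFrame, its
# column names, and the empty 'categorical_to_numeric' list are illustrative
# assumptions, not part of any real dataset.
if __name__ == "__main__":
    import numpy as np

    rng = np.random.default_rng(0)
    demo = pd.DataFrame({
        "feature_a": rng.normal(size=200),
        "feature_b": rng.normal(size=200),
        "label": rng.integers(0, 2, size=200),   # binary integer target
    })
    problem_type = get_problem_type1(demo, "label")   # -> "classification"
    # An empty 'categorical_to_numeric' list means the target was never
    # label-encoded, so the ROC legend falls back to the raw class values.
    results = model_build(demo, "label", problem_type,
                          balance_data="False",
                          steps_dict={"categorical_to_numeric": []})
    print(results["Best model"])
    print(results["Classification Report"])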