# -*- coding: utf-8 -*-
"""
Created on Mon Jul 12 10:00:30 2021
@author: Kishore
"""
################## Importing Modules ###########################################
import math

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import eli5
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (confusion_matrix, classification_report,
                             mean_squared_error, r2_score,
                             roc_curve, auc, roc_auc_score)
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from xgboost import XGBClassifier
#####################################################################
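# Third-party dependencies assumed by the imports above: scikit-learn, xgboost,
# imbalanced-learn, plotly, eli5 and pandas (pd.read_html, used below, also
# needs an HTML parser such as lxml installed).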
############# Identifying the problem type (Classification/Regression) in Predictive Analytics ##########
def get_problem_type1(clean_data, dependent_variable):
    limit_number_of_class = 10
    print("problem analysis")
    # An integer target with at most `limit_number_of_class` distinct values is
    # treated as a class label; anything else is treated as a continuous target.
    if ((clean_data.dtypes[dependent_variable] == 'int32'
            or clean_data.dtypes[dependent_variable] == 'int64')
            and clean_data[dependent_variable].nunique() <= limit_number_of_class):
        return "classification"
    else:
        return "regression"
#########################################################################################################
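# Example usage (hypothetical DataFrame, for illustration only): a
# low-cardinality integer target is routed to classification, a float target
# to regression.
#
#   df = pd.DataFrame({"price": [10.5, 22.0, 13.7], "churn": [0, 1, 0]})
#   get_problem_type1(df, "churn")   # -> "classification" (int64, 2 classes)
#   get_problem_type1(df, "price")   # -> "regression"     (float64 target)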
######################### Model Building for Predictive Analytics ############################
def model_build(clean_data, dependent_variable, problem_type, balance_data, steps_dict):
    print("Model build started")
    d = {}
    lst = []
    ######## Split features and target ##########
    train_data = clean_data.drop(dependent_variable, axis=1)
    target_data = clean_data[dependent_variable]
    if problem_type == "classification":
        data_dict = {}
        ###### Models ####################
        if balance_data == "Auto":
            # Oversample minority classes with SMOTE and record the class
            # distribution before and after resampling.
            d = {}
            d["Before Handling Imbalanced Dataset"] = target_data.value_counts()
            oversample = SMOTE()
            train_data, target_data = oversample.fit_resample(train_data, target_data)
            d["After Handling Imbalanced Dataset"] = target_data.value_counts()
            data_dict["Handling Imbalanced Dataset"] = d
        elif balance_data == "False":
            data_dict["Imbalanced dataset not handled (balance_data is set to False)"] = ""
        X_train, X_test, y_train, y_test = train_test_split(train_data, target_data,
                                                            test_size=0.3, random_state=0)
        # pipeline_lr = Pipeline([('scaler', StandardScaler()),
        #                         ('lr_classifier', LogisticRegression(random_state=0))])
        pipeline_dt = Pipeline([('scaler', StandardScaler()),
                                ('dt_classifier', DecisionTreeClassifier())])
        pipeline_randomforest = Pipeline([('scaler', StandardScaler()),
                                          ('rf_classifier', RandomForestClassifier())])
        pipeline_xgboost = Pipeline([('scaler', StandardScaler()),
                                     ('xg_classifier', XGBClassifier())])
        ############## List of candidate pipelines #####################
        pipelines = [pipeline_dt, pipeline_randomforest, pipeline_xgboost]
        best_accuracy = 0.0
        best_classifier = 0
        best_pipeline = ""
        ################## Dictionary of pipelines and classifier types for ease of reference ############
        pipe_dict = {0: 'Decision_Tree', 1: 'RandomForest', 2: 'XGBoost_Classifier'}
        ########## Fit the pipelines ##################
        for pipe in pipelines:
            pipe.fit(X_train, y_train)
        models_info = {}
        for i, model in enumerate(pipelines):
            accuracy = model.score(X_test, y_test)
            val = "{} Test Accuracy: {}".format(pipe_dict[i], accuracy)
            lst.append(val)
            models_info[pipe_dict[i]] = accuracy
            print(val)
        df_models_info = pd.DataFrame(models_info.items(), columns=["Models", "Accuracy"])
        # Keep the pipeline with the highest held-out accuracy.
        for i, model in enumerate(pipelines):
            if model.score(X_test, y_test) > best_accuracy:
                best_accuracy = model.score(X_test, y_test)
                best_pipeline = model
                best_classifier = i
        html_object = eli5.show_weights(best_pipeline, feature_names=X_train.columns.tolist())
        result = pd.read_html(html_object.data)[0]
        data_dict['Model Interpretation'] = result
        val1 = 'Classifier with best accuracy:{}'.format(pipe_dict[best_classifier])
        lst.append(val1)
        print(val1)
        y_pred = best_pipeline.predict(X_test)
        cn = confusion_matrix(y_test, y_pred)
        data_dict['Model details'] = lst
        fig = px.histogram(df_models_info, x="Models", y="Accuracy", color="Models")
        fig.update_layout(yaxis_title="Accuracy")
        data_dict['model_comparison'] = fig
        data_dict['Best model'] = lst[-1].split(':')[1]
        data_dict['Best pipeline'] = best_pipeline
        data_dict['Confusion Matrix'] = cn
        # Cross-validate only on smaller datasets to keep the run time bounded.
        if len(X_train) <= 100000:
            cv = cross_val_score(best_pipeline, X_train, y_train, cv=5, scoring='accuracy')
            data_dict['Cross Validation'] = cv
        report = classification_report(y_test, y_pred)
        data_dict['Classification Report'] = report
        y_scores = best_pipeline.predict_proba(X_test)
        # One-hot encode the labels so each class gets its own ROC curve
        y_onehot = pd.get_dummies(y_test)
        # Start from an empty figure, add the diagonal chance line, then add
        # one ROC trace per class
        fig = go.Figure()
        fig.add_shape(
            type='line', line=dict(dash='dash'),
            x0=0, x1=1, y0=0, y1=1
        )
        for i in range(y_scores.shape[1]):
            y_true = y_onehot.iloc[:, i]
            y_score = y_scores[:, i]
            fpr, tpr, _ = roc_curve(y_true, y_score)
            auc_score = roc_auc_score(y_true, y_score)
            # Map the encoded class value back to its original label using the
            # encoding recorded in steps_dict; fall back to the encoded value
            # if the target column was never label-encoded.
            class_name = y_onehot.columns[i]
            for data1 in steps_dict['categorical_to_numeric']:
                for key, value in data1.items():
                    col_name = key.split('_encoded')[0]
                    if col_name == dependent_variable:
                        for j, v in value.items():
                            if v == y_onehot.columns[i]:
                                class_name = j
                                break
            name = f"{class_name} (AUC={auc_score:.2f})"
            fig.add_trace(go.Scatter(x=fpr, y=tpr, name=name, mode='lines'))
        fig.update_layout(
            xaxis_title='False Positive Rate',
            yaxis_title='True Positive Rate',
            yaxis=dict(scaleanchor="x", scaleratio=1),
            xaxis=dict(constrain='domain'),
            width=700, height=500
        )
        data_dict['ROC Curve'] = fig
        print("model completed")
        return data_dict
    elif problem_type == "regression":
        data_dict = {}
        X_train, X_test, y_train, y_test = train_test_split(train_data, target_data,
                                                            test_size=0.3, random_state=0)
        pipeline_linear = Pipeline([('scaler', StandardScaler()), ('linear_regressor', LinearRegression())])
        # pipeline_lr = Pipeline([('scaler', StandardScaler()), ('lr_regressor', LogisticRegression())])
        pipeline_dt = Pipeline([('scaler', StandardScaler()), ('dt_regressor', DecisionTreeRegressor())])
        pipeline_randomforest = Pipeline([('scaler', StandardScaler()), ('rf_regressor', RandomForestRegressor())])
        pipeline_svm = Pipeline([('scaler', StandardScaler()), ('svr', SVR(kernel='linear'))])
        pipeline_regression = [pipeline_linear, pipeline_dt, pipeline_randomforest, pipeline_svm]
        best_accuracy = 0.0
        best_regressor = 0
        best_pipeline = ""
        ################## Dictionary of pipelines and regressor types for ease of reference ############
        pipe_dict = {0: 'Linear_Regression', 1: 'Decision_Tree', 2: 'RandomForest', 3: 'SVM'}
        for pipe in pipeline_regression:
            pipe.fit(X_train, y_train)
        models_info = {}
        # For regressors, score() reports the R^2 value on the test set.
        for i, model in enumerate(pipeline_regression):
            score = model.score(X_test, y_test)
            val = "{} Test Accuracy: {}".format(pipe_dict[i], score)
            lst.append(val)
            models_info[pipe_dict[i]] = score
            print(val)
        df_models_info = pd.DataFrame(models_info.items(), columns=["Models", "Accuracy"])
        # Keep the pipeline with the highest held-out score.
        for i, model in enumerate(pipeline_regression):
            if model.score(X_test, y_test) > best_accuracy:
                best_accuracy = model.score(X_test, y_test)
                best_pipeline = model
                best_regressor = i
        html_object = eli5.show_weights(best_pipeline, feature_names=X_train.columns.tolist())
        result = pd.read_html(html_object.data)[0]
        data_dict['Model Interpretation'] = result
        val1 = 'Regressor with best accuracy:{}'.format(pipe_dict[best_regressor])
        lst.append(val1)
        print(val1)
        data_dict['Model details'] = lst
        fig = px.histogram(df_models_info, x="Models", y="Accuracy", color="Models")
        fig.update_layout(yaxis_title="Accuracy")
        data_dict['model_comparison'] = fig
        data_dict['Best model'] = lst[-1].split(':')[1]
        data_dict['Best pipeline'] = best_pipeline
        y_pred = best_pipeline.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        rmse = math.sqrt(mse)
        r2 = r2_score(y_test, y_pred)
        data_dict['MEAN SQUARED ERROR'] = "MEAN SQUARED ERROR : " + str(mse)
        data_dict['ROOT MEAN SQUARED ERROR'] = "ROOT MEAN SQUARED ERROR : " + str(rmse)
        data_dict['R2 Score'] = "R2 Score : " + str(r2)
        cv = cross_val_score(best_pipeline, X_train, y_train, cv=5)
        data_dict['Cross Validation'] = cv
        # Scatter the actual vs. predicted target values against sample index.
        fig = go.Figure([
            go.Scatter(y=y_test, name='Actual', mode='markers'),
            go.Scatter(y=y_pred, name='Predicted', mode='markers')
        ])
        fig.update_layout(
            title=str(lst[-1].split(':')[1]),
            xaxis_title="Sample index",
            yaxis_title="Target values")
        data_dict['Regression graph'] = fig
        return data_dict
    else:
        # Unknown problem type: return the default (empty) dictionary.
        return d
###############################################################################################
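# Usage sketch: a minimal, hypothetical smoke test of the two functions above,
# assuming the listed dependencies are installed. The synthetic DataFrame, its
# column names, and the empty 'categorical_to_numeric' list are illustrative
# assumptions, not part of any real dataset.
if __name__ == "__main__":
    import numpy as np

    rng = np.random.default_rng(0)
    demo = pd.DataFrame({
        "feature_a": rng.normal(size=200),
        "feature_b": rng.normal(size=200),
        "label": rng.integers(0, 2, size=200),   # binary integer target
    })
    problem_type = get_problem_type1(demo, "label")   # -> "classification"
    # An empty 'categorical_to_numeric' list means the target was never
    # label-encoded, so the ROC legend falls back to the raw class values.
    results = model_build(demo, "label", problem_type,
                          balance_data="False",
                          steps_dict={"categorical_to_numeric": []})
    print(results["Best model"])
    print(results["Classification Report"])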