| import pandas as pd |
| from preprocessing import Preprocessor |
| import xgboost |
| import matplotlib.pyplot as plt |
| import shap |
| import random |
| import numpy as np |
|
|
def obtain_scaler_and_label_enc():
    """Fit the project preprocessor on the training CSV and return its artifacts.

    Returns:
        tuple: (fitted scaler, label encoders, training feature columns).
    """
    preprocessor = Preprocessor(
        "data/train.csv",
        "Exited",
        ["Exited", "Tenure"],
        resampling="under",
        scaling='minmax',
    )

    # Running the pipeline fits the scaler / encoders as a side effect;
    # only the training feature frame is needed here (first tuple element).
    X_train = preprocessor.process_cp()[0]

    return preprocessor.scaler, preprocessor.label_encoders, X_train.columns
|
|
def scale_dataset(test_df, target_column, train_cols, scaler):
    """Scale the feature columns of ``test_df`` with a pre-fitted scaler.

    Args:
        test_df: frame holding features plus the target column.
        target_column: name of the label column (left unscaled).
        train_cols: feature column order used when the scaler was fitted.
        scaler: fitted transformer exposing ``transform``.

    Returns:
        tuple: (unscaled features in training column order,
        scaled feature frame with the target column re-attached).
    """
    # Drop the label, then force the training-time column order before scaling.
    features = test_df.drop(target_column, axis=1)[train_cols]
    scaled = pd.DataFrame(
        scaler.transform(features),
        columns=features.columns,
        index=features.index,
    )
    return features, pd.concat([scaled, test_df[target_column]], axis=1)
|
|
def obtain_explanations(booster, val_pd_X):
    """Build a SHAP tree explainer over ``val_pd_X`` and compute attributions.

    Args:
        booster: trained XGBoost booster.
        val_pd_X: feature frame to explain (also used as background data).

    Returns:
        tuple: (explainer, raw shap values array, shap values as a frame
        indexed like ``val_pd_X``, Explanation object for plotting).
    """
    explainer = shap.TreeExplainer(
        booster,
        feature_perturbation="interventional",
        model_output="probability",
        data=val_pd_X,
    )

    shap_values = explainer.shap_values(val_pd_X)
    # Keep attributions aligned with the customer ids of the input frame;
    # the index is named 'index' to match downstream lookups.
    shap_values_df = pd.DataFrame(
        shap_values, index=pd.Index(val_pd_X.index, name='index')
    )

    explanation = explainer(val_pd_X)
    return explainer, shap_values, shap_values_df, explanation
|
|
|
|
def plot_feature_impact(shap_values, features, feature_names, max_display=10, plot_size=(12,6)):
    """Render and save a SHAP summary (beeswarm) of global feature impact.

    Writes the figure to ``img/feature_impact_plot.png`` and shows it.

    Args:
        shap_values: SHAP values for the whole cohort.
        features: the feature matrix the values were computed on.
        feature_names: column labels for the plot.
        max_display: number of top features to draw.
        plot_size: figure size passed to matplotlib / shap.
    """
    plt.figure(figsize=plot_size)

    shap.summary_plot(
        shap_values=shap_values,
        features=features,
        feature_names=feature_names,
        max_display=max_display,
        plot_size=plot_size,
        show=False,
    )

    # Titles / labels are applied after summary_plot so they land on its axes.
    plt.title('Feature Impact plot', fontsize=16)
    plt.xlabel('Impact on Model Output (Negative: Non-Churner, Positive: Churner)', fontsize=14)
    plt.ylabel('Features', fontsize=14)
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)
    plt.subplots_adjust(left=0.2, right=0.95, top=0.95, bottom=0.1)

    # Save before show(): show() may clear the active figure.
    plt.savefig("img/feature_impact_plot.png", dpi=300, bbox_inches='tight', pad_inches=0.3)
    plt.show()
| |
|
|
def obtain_predictions(booster, val_pd_X, val_pd_y):
    """Score customers with the booster and attach predictions to the labels.

    Args:
        booster: trained XGBoost booster.
        val_pd_X: feature frame of the customers to score.
        val_pd_y: ground-truth labels aligned with ``val_pd_X``.

    Returns:
        tuple: (labels+predictions frame, matching feature rows, the same
        labels+predictions frame again — two names for one object).
    """
    dmatrix = xgboost.DMatrix(
        val_pd_X.values,
        feature_names=list(val_pd_X.columns),
    )

    raw_scores = booster.predict(dmatrix)
    # NOTE(review): a sigmoid is applied on top of booster.predict() output.
    # If the model's objective already emits probabilities, this double-squashes
    # them — confirm against the training objective before trusting this column.
    squashed = 1 / (1 + np.exp(-raw_scores))

    enriched = pd.DataFrame(val_pd_y)
    enriched['predicted_probability'] = squashed
    enriched['predicted_prediction'] = raw_scores

    # Feature rows restricted to the customers present in the label frame.
    matching_X = val_pd_X[val_pd_X.index.isin(enriched.index.tolist())]

    return enriched, matching_X, enriched
|
|
|
|
def extract_customer(test, test_pd_X_tmp, test_pd_y_tmp, test_unscaled_pd):
    """Pick one random customer from the sampled frames and gather its views.

    Args:
        test: full (scaled) feature frame, indexed by customer id.
        test_pd_X_tmp: sampled feature frame to draw the customer from.
        test_pd_y_tmp: sampled label/prediction frame, row-aligned with the features.
        test_unscaled_pd: unscaled feature frame for human-readable values.

    Returns:
        tuple: (positional offset in the sample, customer id, 1-row scaled
        features, 1-row labels, unscaled feature Series, row from ``test``).
    """
    chosen_idx = random.choice(test_pd_y_tmp.index.tolist())
    print("Customer Index:", chosen_idx)

    # Positional offset inside the sampled frame (X and y share row order).
    offset = test_pd_X_tmp.index.get_loc(chosen_idx)

    x_reset = test_pd_X_tmp.reset_index()
    y_reset = test_pd_y_tmp.reset_index()
    one_x = x_reset[offset:offset + 1].set_index('index')
    one_y = y_reset[offset:offset + 1].set_index('index')

    unscaled_row = test_unscaled_pd.loc[chosen_idx]
    full_record = test[test.index == chosen_idx]

    return offset, chosen_idx, one_x, one_y, unscaled_row, full_record
|
|
def plot_waterfall(customer_shap_values, explainer_expected_value, customer_x, customer_x_original, customer_prediction, actual_churn, customer_idx, max_display=10):
    """Draw a SHAP waterfall plot for a single customer and return the figure.

    Args:
        customer_shap_values: array of SHAP values; row 0 is the customer.
        explainer_expected_value: base value from the explainer.
        customer_x: 1-row frame of (scaled) feature values.
        customer_x_original: unscaled feature values, shown next to the bars.
        customer_prediction: unused here; kept for interface compatibility.
        actual_churn: unused here; kept for interface compatibility.
        customer_idx: customer identifier, used in the plot title.
        max_display: maximum number of features to draw.

    Returns:
        matplotlib.figure.Figure: the figure containing the waterfall plot.
    """
    fig = plt.figure(figsize=(20, 15))

    shap_values = shap.Explanation(
        values=customer_shap_values[0],
        base_values=explainer_expected_value,
        data=customer_x,
        feature_names=customer_x.columns.tolist(),
        # display_data shows raw (unscaled) values instead of scaled ones.
        display_data=customer_x_original[customer_x.columns.tolist()]
    )

    shap.plots.waterfall(shap_values, show=False, max_display=max_display)

    plt.title(f"Feature Impact Analysis for Customer {customer_idx}", fontsize=13)

    # (removed: unused plt.xlim()/plt.ylim() reads — dead code)

    plt.tight_layout()

    return fig
| |
|
|
|
|
|
|
if __name__ == "__main__":
    # Load the two validation cohorts (filenames suggest 100 churners and
    # 100 non-churners) and stack them into one frame.
    print("Download the dataset")
    hundred_churners_val = pd.read_csv('data/hundred_val_churners.csv', index_col=0)
    hundred_non_churners_val = pd.read_csv('data/hundred_val_non_churners.csv', index_col=0)
    val_df = pd.concat([hundred_churners_val, hundred_non_churners_val], axis=0)


    # Re-fit the training-time scaler/encoders and scale the validation features.
    print("Scale the dataset")
    scaler, label_encs, train_cols = obtain_scaler_and_label_enc()
    val_ordered_df, val_scaled_df = scale_dataset(val_df, "Exited", train_cols, scaler)


    # Load the persisted model; the raw Booster is needed for DMatrix scoring
    # and for the SHAP TreeExplainer.
    xgb_model = xgboost.XGBClassifier()


    xgb_model.load_model("models/xgb_churn_model.json")
    booster = xgb_model.get_booster()


    # Keep two views: unscaled features (for display) and scaled ones (for the model).
    val_unscaled_pd = val_ordered_df
    # Selecting only the training columns drops the re-attached 'Exited' column.
    val2 = pd.DataFrame(val_scaled_df, columns=val_ordered_df.columns, index = val_ordered_df.index)
    val_pd_X = val2
    val_pd_y = val_df['Exited']


    # Global explanation: SHAP values for the whole validation cohort.
    explainer, shap_values, shap_values_df, explaination = obtain_explanations(booster, val_pd_X)
    plot_feature_impact(shap_values, val_pd_X, val_pd_X.columns.tolist())


    churners_and_non, churners_and_non_X, new_y = obtain_predictions(booster, val_pd_X, val_pd_y)


    # Pick one random customer out of a 30-row sample; the same random_state
    # keeps the X and y samples row-aligned.
    sample_size=30
    customer_pos, customer_idx, customer_x, customer_y, customer_x_original,customer_record = extract_customer(val2, churners_and_non_X.sample(sample_size,random_state=42), churners_and_non.sample(sample_size,random_state=42), val_unscaled_pd)
    

    # NOTE(review): this prints 'predicted_prediction' under a "probability"
    # label while a 'predicted_probability' column also exists — confirm which
    # column is the true probability for this model's objective.
    print(f"\nChurn prediction probability: {customer_y.predicted_prediction.values[0]}")
    print(f"Actual churn: {customer_y.Exited.values[0]}")


    # Local explanation: the SHAP row for the chosen customer only.
    customer_shap_values = np.array(shap_values_df[shap_values_df.index == customer_idx])


    fig = plot_waterfall(customer_shap_values, explainer.expected_value, customer_x, customer_x_original, new_y[new_y.index == customer_idx].predicted_prediction, 'Yes', customer_idx)
    plt.savefig(f"img/customer_{customer_idx}_waterfall.png", dpi=300, bbox_inches='tight', pad_inches=0.3)
    plt.show()