# Upload demo code — author: cmmedoro, commit f8da90e
# (GitHub header retained as a comment so the file remains valid Python.)
import pandas as pd
from preprocessing import Preprocessor
import xgboost
import matplotlib.pyplot as plt
import shap
import random
import numpy as np
def obtain_scaler_and_label_enc():
    """Fit the training-time preprocessing and return its artifacts.

    Runs the project's Preprocessor over the training CSV so that the
    exact same scaler / label encoders can later be applied to
    validation data.

    Returns:
        tuple: (fitted scaler, label-encoder mapping, training feature
        columns in the order the model was trained on).
    """
    preprocessor = Preprocessor(
        "data/train.csv",
        "Exited",
        ["Exited", "Tenure"],
        resampling="under",
        scaling='minmax',
    )
    # Only the training features are needed here; the remaining seven
    # outputs of process_cp() are discarded.
    X_train, *_ = preprocessor.process_cp()
    return preprocessor.scaler, preprocessor.label_encoders, X_train.columns
def scale_dataset(test_df, target_column, train_cols, scaler):
    """Apply a fitted scaler to the feature columns of *test_df*.

    Args:
        test_df: DataFrame containing features plus the target column.
        target_column: name of the label column to exclude from scaling.
        train_cols: feature columns in training order; the test features
            are reindexed to this order before scaling.
        scaler: fitted scaler exposing ``transform``.

    Returns:
        tuple: (unscaled features in training order, scaled features
        re-joined with the untouched target column).
    """
    features = test_df.drop(target_column, axis=1)[train_cols]
    # Rewrap the scaler's ndarray output so column names and the original
    # row index survive the transform.
    scaled = pd.DataFrame(
        scaler.transform(features),
        columns=features.columns,
        index=features.index,
    )
    with_target = pd.concat([scaled, test_df[target_column]], axis=1)
    return features, with_target
def obtain_explanations(booster, val_pd_X):
    """Compute SHAP values for *val_pd_X* against an XGBoost booster.

    Args:
        booster: trained XGBoost booster to explain.
        val_pd_X: DataFrame of (scaled) features to explain.

    Returns:
        tuple: (the TreeExplainer, raw shap values array, shap values as a
        DataFrame indexed like *val_pd_X*, and the Explanation object).
    """
    explainer = shap.TreeExplainer(
        booster,
        feature_perturbation="interventional",
        model_output="probability",
        data=val_pd_X,
    )
    shap_values = explainer.shap_values(val_pd_X)
    # Re-attach the customer index so rows can be looked up later by id.
    shap_values_df = pd.DataFrame(shap_values)
    shap_values_df['index'] = val_pd_X.index
    shap_values_df.set_index('index', inplace=True)
    explaination = explainer(val_pd_X)
    return explainer, shap_values, shap_values_df, explaination
def plot_feature_impact(shap_values, features, feature_names, max_display=10, plot_size=(12, 6)):
    """Draw, save, and show a SHAP summary (beeswarm) plot.

    The figure is written to ``img/feature_impact_plot.png`` and then shown.

    Args:
        shap_values: SHAP values array for *features*.
        features: feature matrix the SHAP values were computed on.
        feature_names: labels for the y-axis.
        max_display: cap on the number of features shown.
        plot_size: figure size in inches.
    """
    figure = plt.figure(figsize=plot_size)
    summary_kwargs = dict(
        shap_values=shap_values,
        features=features,
        feature_names=feature_names,
        max_display=max_display,
        plot_size=plot_size,
        show=False,  # defer rendering so titles/labels can be customized
    )
    shap.summary_plot(**summary_kwargs)
    plt.title('Feature Impact plot', fontsize=16)
    plt.xlabel('Impact on Model Output (Negative: Non-Churner, Positive: Churner)', fontsize=14)
    plt.ylabel('Features', fontsize=14)
    for tick_setter in (plt.xticks, plt.yticks):
        tick_setter(fontsize=20)
    plt.subplots_adjust(left=0.2, right=0.95, top=0.95, bottom=0.1)
    plt.savefig("img/feature_impact_plot.png", dpi=300, bbox_inches='tight', pad_inches=0.3)
    plt.show()
def obtain_predictions(booster, val_pd_X, val_pd_y):
    """Score validation features with the raw booster and collect results.

    NOTE(review): ``booster.predict`` on a binary:logistic model already
    returns probabilities by default, so the extra sigmoid below may be a
    double transform — confirm the model's objective before trusting the
    'predicted_probability' column.

    Args:
        booster: trained XGBoost booster.
        val_pd_X: DataFrame of (scaled) features.
        val_pd_y: true labels aligned with *val_pd_X*.

    Returns:
        tuple: (labels + prediction columns, the matching feature rows,
        and the same labels DataFrame again).
    """
    dmatrix = xgboost.DMatrix(
        val_pd_X.values,
        feature_names=list(val_pd_X.columns),
    )
    raw_scores = booster.predict(dmatrix)
    sigmoid_scores = 1 / (1 + np.exp(-raw_scores))
    new_y = pd.DataFrame(val_pd_y)
    new_y['predicted_probability'] = sigmoid_scores
    new_y['predicted_prediction'] = raw_scores
    churners_and_non = new_y
    # Keep only the feature rows whose index appears in the scored frame.
    churners_and_non_X = val_pd_X[val_pd_X.index.isin(list(churners_and_non.index))]
    return churners_and_non, churners_and_non_X, new_y
def extract_customer(test, test_pd_X_tmp, test_pd_y_tmp, test_unscaled_pd):
    """Pick one customer at random and return their aligned rows.

    Args:
        test: full (scaled) feature frame used to pull the raw record.
        test_pd_X_tmp: candidate feature rows to sample from.
        test_pd_y_tmp: candidate label/prediction rows (same index).
        test_unscaled_pd: unscaled feature frame for display values.

    Returns:
        tuple: (positional row, chosen index, scaled feature row, label
        row, unscaled feature Series, raw record from *test*) — feature
        and label rows are re-indexed by the original customer index.
    """
    chosen_idx = random.choice(list(test_pd_y_tmp.index))
    print("Customer Index:", chosen_idx)
    position = test_pd_X_tmp.index.get_loc(chosen_idx)
    # reset_index() moves the customer id into an 'index' column so the
    # positional slice can be re-keyed by it afterwards.
    flat_x = test_pd_X_tmp.reset_index()
    flat_y = test_pd_y_tmp.reset_index()
    row_x = flat_x[position:position + 1].set_index('index')
    row_y = flat_y[position:position + 1].set_index('index')
    unscaled_row = test_unscaled_pd.loc[chosen_idx]
    raw_record = test[test.index == chosen_idx]
    return position, chosen_idx, row_x, row_y, unscaled_row, raw_record
def plot_waterfall(customer_shap_values, explainer_expected_value, customer_x, customer_x_original, customer_prediction, actual_churn, customer_idx, max_display=10):
    """Build a SHAP waterfall figure for a single customer.

    The figure is returned (not saved or shown); the caller handles output.
    ``customer_prediction`` and ``actual_churn`` are currently accepted but
    not used by the plot itself.

    Args:
        customer_shap_values: SHAP value rows for the customer (row 0 used).
        explainer_expected_value: base value from the explainer.
        customer_x: one-row DataFrame of the customer's scaled features.
        customer_x_original: unscaled feature values for display.
        customer_prediction: model output for the customer (unused here).
        actual_churn: ground-truth label (unused here).
        customer_idx: customer identifier used in the title.
        max_display: cap on the number of features shown.

    Returns:
        matplotlib.figure.Figure: the waterfall figure.
    """
    fig = plt.figure(figsize=(20, 15))
    ordered_features = customer_x.columns.tolist()
    explanation = shap.Explanation(
        values=customer_shap_values[0],
        base_values=explainer_expected_value,
        data=customer_x,
        feature_names=ordered_features,
        # Show raw (unscaled) values next to each bar instead of scaled ones.
        display_data=customer_x_original[ordered_features],
    )
    shap.plots.waterfall(explanation, show=False, max_display=max_display)
    plt.title(f"Feature Impact Analysis for Customer {customer_idx}", fontsize=13)
    x_min, x_max = plt.xlim()
    y_min, y_max = plt.ylim()
    plt.tight_layout()
    return fig
if __name__ == "__main__":
    # Load the two pre-sampled validation cohorts and stack them.
    print("Download the dataset")
    churners_df = pd.read_csv('data/hundred_val_churners.csv', index_col=0)
    non_churners_df = pd.read_csv('data/hundred_val_non_churners.csv', index_col=0)
    validation_df = pd.concat([churners_df, non_churners_df], axis=0)

    # Reuse the training-time scaler so validation features match the model.
    print("Scale the dataset")
    scaler, label_encoders, train_cols = obtain_scaler_and_label_enc()
    features_unscaled, scaled_with_target = scale_dataset(validation_df, "Exited", train_cols, scaler)

    # Rebuild the persisted classifier and expose its low-level booster,
    # which both SHAP and DMatrix-based scoring consume.
    model = xgboost.XGBClassifier()
    model.load_model("models/xgb_churn_model.json")
    booster = model.get_booster()

    # Keep only the feature columns (drops the target) in training order.
    X_val = pd.DataFrame(scaled_with_target, columns=features_unscaled.columns, index=features_unscaled.index)
    y_val = validation_df['Exited']

    explainer, shap_values, shap_values_df, explanation = obtain_explanations(booster, X_val)
    plot_feature_impact(shap_values, X_val, X_val.columns.tolist())
    churners_and_non, churners_and_non_X, scored_y = obtain_predictions(booster, X_val, y_val)

    # Draw a matched 30-row sample from predictions and features (same seed
    # keeps the two samples row-aligned), then pick one customer at random.
    sample_size = 30
    sampled_X = churners_and_non_X.sample(sample_size, random_state=42)
    sampled_y = churners_and_non.sample(sample_size, random_state=42)
    customer_pos, customer_idx, customer_x, customer_y, customer_x_original, customer_record = extract_customer(
        X_val, sampled_X, sampled_y, features_unscaled
    )
    # NOTE(review): this prints the raw 'predicted_prediction' column under a
    # "probability" label — confirm whether 'predicted_probability' was meant.
    print(f"\nChurn prediction probability: {customer_y.predicted_prediction.values[0]}")
    print(f"Actual churn: {customer_y.Exited.values[0]}")

    customer_shap_values = np.array(shap_values_df[shap_values_df.index == customer_idx])
    fig = plot_waterfall(
        customer_shap_values,
        explainer.expected_value,
        customer_x,
        customer_x_original,
        scored_y[scored_y.index == customer_idx].predicted_prediction,
        'Yes',
        customer_idx,
    )
    plt.savefig(f"img/customer_{customer_idx}_waterfall.png", dpi=300, bbox_inches='tight', pad_inches=0.3)
    plt.show()