| import pandas as pd |
| from preprocessing import Preprocessor |
| import xgboost |
| import matplotlib.pyplot as plt |
| import shap |
| import random |
| import numpy as np |
|
|
def obtain_scaler_and_label_enc():
    """Fit the project preprocessor on the training CSV and return its artifacts.

    Returns:
        tuple: (fitted scaler, label encoders, training feature columns).
    """
    preprocessor = Preprocessor(
        "data/train.csv",
        "Exited",
        ["Exited", "Tenure"],
        resampling="under",
        scaling='minmax',
    )

    # Running the pipeline fits the scaler / encoders as a side effect;
    # only the training feature frame is needed here (first tuple element).
    X_train = preprocessor.process_cp()[0]

    return preprocessor.scaler, preprocessor.label_encoders, X_train.columns
|
|
def scale_dataset(test_df, target_column, train_cols, scaler):
    """Scale the feature columns of ``test_df`` with a pre-fitted scaler.

    Args:
        test_df: frame holding features plus the target column.
        target_column: name of the label column (left unscaled).
        train_cols: feature column order used when the scaler was fitted.
        scaler: fitted transformer exposing ``transform``.

    Returns:
        tuple: (unscaled features in training column order,
        scaled feature frame with the target column re-attached).
    """
    # Drop the label, then force the training-time column order before scaling.
    features = test_df.drop(target_column, axis=1)[train_cols]
    scaled = pd.DataFrame(
        scaler.transform(features),
        columns=features.columns,
        index=features.index,
    )
    return features, pd.concat([scaled, test_df[target_column]], axis=1)
|
|
def obtain_explanations(booster, val_pd_X):
    """Build a SHAP tree explainer over ``val_pd_X`` and compute attributions.

    Args:
        booster: trained XGBoost booster.
        val_pd_X: feature frame to explain (also used as background data).

    Returns:
        tuple: (explainer, raw shap values array, shap values as a frame
        indexed like ``val_pd_X``, Explanation object for plotting).
    """
    explainer = shap.TreeExplainer(
        booster,
        feature_perturbation="interventional",
        model_output="probability",
        data=val_pd_X,
    )

    shap_values = explainer.shap_values(val_pd_X)
    # Keep attributions aligned with the customer ids of the input frame;
    # the index is named 'index' to match downstream lookups.
    shap_values_df = pd.DataFrame(
        shap_values, index=pd.Index(val_pd_X.index, name='index')
    )

    explanation = explainer(val_pd_X)
    return explainer, shap_values, shap_values_df, explanation
|
|
|
|
def plot_feature_impact(shap_values, features, feature_names, max_display=10, plot_size=(12,6)):
    """Render and save a SHAP summary (beeswarm) of global feature impact.

    Writes the figure to ``img/feature_impact_plot.png`` and shows it.

    Args:
        shap_values: SHAP values for the whole cohort.
        features: the feature matrix the values were computed on.
        feature_names: column labels for the plot.
        max_display: number of top features to draw.
        plot_size: figure size passed to matplotlib / shap.
    """
    plt.figure(figsize=plot_size)

    shap.summary_plot(
        shap_values=shap_values,
        features=features,
        feature_names=feature_names,
        max_display=max_display,
        plot_size=plot_size,
        show=False,
    )

    # Titles / labels are applied after summary_plot so they land on its axes.
    plt.title('Feature Impact plot', fontsize=16)
    plt.xlabel('Impact on Model Output (Negative: Non-Churner, Positive: Churner)', fontsize=14)
    plt.ylabel('Features', fontsize=14)
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)
    plt.subplots_adjust(left=0.2, right=0.95, top=0.95, bottom=0.1)

    # Save before show(): show() may clear the active figure.
    plt.savefig("img/feature_impact_plot.png", dpi=300, bbox_inches='tight', pad_inches=0.3)
    plt.show()
| |
|
|
def obtain_predictions(booster, val_pd_X, val_pd_y):
    """Score customers with the booster and attach predictions to the labels.

    Args:
        booster: trained XGBoost booster.
        val_pd_X: feature frame of the customers to score.
        val_pd_y: ground-truth labels aligned with ``val_pd_X``.

    Returns:
        tuple: (labels+predictions frame, matching feature rows, the same
        labels+predictions frame again — two names for one object).
    """
    dmatrix = xgboost.DMatrix(
        val_pd_X.values,
        feature_names=list(val_pd_X.columns),
    )

    raw_scores = booster.predict(dmatrix)
    # NOTE(review): a sigmoid is applied on top of booster.predict() output.
    # If the model's objective already emits probabilities, this double-squashes
    # them — confirm against the training objective before trusting this column.
    squashed = 1 / (1 + np.exp(-raw_scores))

    enriched = pd.DataFrame(val_pd_y)
    enriched['predicted_probability'] = squashed
    enriched['predicted_prediction'] = raw_scores

    # Feature rows restricted to the customers present in the label frame.
    matching_X = val_pd_X[val_pd_X.index.isin(enriched.index.tolist())]

    return enriched, matching_X, enriched
|
|
|
|
def extract_customer(test, test_pd_X_tmp, test_pd_y_tmp, test_unscaled_pd):
    """Pick one random customer from the sampled frames and gather its views.

    Args:
        test: full (scaled) feature frame, indexed by customer id.
        test_pd_X_tmp: sampled feature frame to draw the customer from.
        test_pd_y_tmp: sampled label/prediction frame, row-aligned with the features.
        test_unscaled_pd: unscaled feature frame for human-readable values.

    Returns:
        tuple: (positional offset in the sample, customer id, 1-row scaled
        features, 1-row labels, unscaled feature Series, row from ``test``).
    """
    chosen_idx = random.choice(test_pd_y_tmp.index.tolist())
    print("Customer Index:", chosen_idx)

    # Positional offset inside the sampled frame (X and y share row order).
    offset = test_pd_X_tmp.index.get_loc(chosen_idx)

    x_reset = test_pd_X_tmp.reset_index()
    y_reset = test_pd_y_tmp.reset_index()
    one_x = x_reset[offset:offset + 1].set_index('index')
    one_y = y_reset[offset:offset + 1].set_index('index')

    unscaled_row = test_unscaled_pd.loc[chosen_idx]
    full_record = test[test.index == chosen_idx]

    return offset, chosen_idx, one_x, one_y, unscaled_row, full_record
|
|
def plot_waterfall(customer_shap_values, explainer_expected_value, customer_x, customer_x_original, customer_prediction, actual_churn, customer_idx, max_display=10):
    """Draw a SHAP waterfall plot for a single customer and return the figure.

    Args:
        customer_shap_values: array of SHAP values; row 0 is the customer.
        explainer_expected_value: base value from the explainer.
        customer_x: 1-row frame of (scaled) feature values.
        customer_x_original: unscaled feature values, shown next to the bars.
        customer_prediction: unused here; kept for interface compatibility.
        actual_churn: unused here; kept for interface compatibility.
        customer_idx: customer identifier, used in the plot title.
        max_display: maximum number of features to draw.

    Returns:
        matplotlib.figure.Figure: the figure containing the waterfall plot.
    """
    fig = plt.figure(figsize=(20, 15))

    shap_values = shap.Explanation(
        values=customer_shap_values[0],
        base_values=explainer_expected_value,
        data=customer_x,
        feature_names=customer_x.columns.tolist(),
        # display_data shows raw (unscaled) values instead of scaled ones.
        display_data=customer_x_original[customer_x.columns.tolist()]
    )

    shap.plots.waterfall(shap_values, show=False, max_display=max_display)

    plt.title(f"Feature Impact Analysis for Customer {customer_idx}", fontsize=13)

    # (removed: unused plt.xlim()/plt.ylim() reads — dead code)

    plt.tight_layout()

    return fig
| |
|
|
|
|
|
|
if __name__ == "__main__":
    # Load the two validation cohorts (filenames suggest 100 churners and
    # 100 non-churners) and stack them into one frame.
    print("Download the dataset")
    hundred_churners_val = pd.read_csv('data/hundred_val_churners.csv', index_col=0)
    hundred_non_churners_val = pd.read_csv('data/hundred_val_non_churners.csv', index_col=0)
    val_df = pd.concat([hundred_churners_val, hundred_non_churners_val], axis=0)


    # Re-fit the training-time scaler/encoders and scale the validation features.
    print("Scale the dataset")
    scaler, label_encs, train_cols = obtain_scaler_and_label_enc()
    val_ordered_df, val_scaled_df = scale_dataset(val_df, "Exited", train_cols, scaler)


    # Load the persisted model; the raw Booster is needed for DMatrix scoring
    # and for the SHAP TreeExplainer.
    xgb_model = xgboost.XGBClassifier()


    xgb_model.load_model("models/xgb_churn_model.json")
    booster = xgb_model.get_booster()


    # Keep two views: unscaled features (for display) and scaled ones (for the model).
    val_unscaled_pd = val_ordered_df
    # Selecting only the training columns drops the re-attached 'Exited' column.
    val2 = pd.DataFrame(val_scaled_df, columns=val_ordered_df.columns, index = val_ordered_df.index)
    val_pd_X = val2
    val_pd_y = val_df['Exited']


    # Global explanation: SHAP values for the whole validation cohort.
    explainer, shap_values, shap_values_df, explaination = obtain_explanations(booster, val_pd_X)
    plot_feature_impact(shap_values, val_pd_X, val_pd_X.columns.tolist())


    churners_and_non, churners_and_non_X, new_y = obtain_predictions(booster, val_pd_X, val_pd_y)


    # Pick one random customer out of a 30-row sample; the same random_state
    # keeps the X and y samples row-aligned.
    sample_size=30
    customer_pos, customer_idx, customer_x, customer_y, customer_x_original,customer_record = extract_customer(val2, churners_and_non_X.sample(sample_size,random_state=42), churners_and_non.sample(sample_size,random_state=42), val_unscaled_pd)
    

    # NOTE(review): this prints 'predicted_prediction' under a "probability"
    # label while a 'predicted_probability' column also exists — confirm which
    # column is the true probability for this model's objective.
    print(f"\nChurn prediction probability: {customer_y.predicted_prediction.values[0]}")
    print(f"Actual churn: {customer_y.Exited.values[0]}")


    # Local explanation: the SHAP row for the chosen customer only.
    customer_shap_values = np.array(shap_values_df[shap_values_df.index == customer_idx])


    fig = plot_waterfall(customer_shap_values, explainer.expected_value, customer_x, customer_x_original, new_y[new_y.index == customer_idx].predicted_prediction, 'Yes', customer_idx)
    plt.savefig(f"img/customer_{customer_idx}_waterfall.png", dpi=300, bbox_inches='tight', pad_inches=0.3)
    plt.show()