File size: 4,898 Bytes
8f39cdb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import pickle
#models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
#Improving accuracy
from imblearn.over_sampling import SMOTE

# Load the churn dataset; the path is resolved relative to the current working directory.
df = pd.read_csv('churn.csv')

# Apply the seaborn "whitegrid" theme and open one 12x10-inch figure for the
# exploratory plots below.
sns.set_style(style="whitegrid")
plt.figure(figsize=(12, 10))

# NOTE: every plotting call below is commented out while its plt.title call is
# still active. The titles are therefore all applied to the same current axes,
# each replacing the previous one, and nothing is displayed because plt.show()
# is commented out as well. To inspect a plot, re-enable its sns.* line together
# with plt.show().

# Class balance of the target column.
#sns.countplot(x='Exited', data=df)
plt.title('Churn Distribution')
# Histogram of customer age with a KDE overlay.
#sns.histplot(data=df, x='Age', kde=True)
plt.title('Age Distribution')

# Credit score against age, colored by churn outcome.
#sns.scatterplot(data=df, x='CreditScore', y='Age', hue='Exited')
plt.title('Credit Score vs Age')

# Account balance split by churn outcome.
#sns.boxplot(data=df, x='Exited', y='Balance')
plt.title('Balance vs Churn')

# Credit score split by churn outcome.
#sns.boxplot(x='Exited', y='CreditScore', data=df)
plt.title('Credit Score vs Churn')
#plt.show()

#Feature Engineering
# Drop the target ('Exited') and the identifier columns, which are not used as
# model inputs.
features = df.drop(columns=['Exited', 'RowNumber', 'CustomerId', 'Surname'])
# Derived features computed from the raw columns of df.
features["CLV"] = df["Balance"] * df["EstimatedSalary"] / 100000
# Bucket ages into four labeled ranges. With pandas' default right-closed bins
# these are (0, 30], (30, 45], (45, 60], (60, 100]; an age outside (0, 100]
# gets no group and therefore all-zero AgeGroup dummy columns.
features["AgeGroup"] = pd.cut(df["Age"], bins=[0, 30, 45, 60, 100], labels=["Young", "MiddleAged", "Senior", "Elderly"])
features["TenureAgeRatio"] = df["Tenure"] / df["Age"]
# One-hot encode the categorical columns so every feature is numeric.
features = pd.get_dummies(features, columns=['Geography', 'Gender', 'AgeGroup'])
target = df['Exited']

#Train Test Split
# 80/20 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Fit the scaler on the training split only and reuse those statistics for the
# test split. Re-fitting on X_test (fit_transform) would standardize the test
# data with its own mean/variance, so the model would be evaluated on features
# scaled differently from the ones it was trained on.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

#SMOTE
# Oversample the training split only; the test split is left untouched so the
# evaluation reflects the original class distribution.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

#Logistic Regression
# Baseline model trained on the original (non-resampled) training split.
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train, y_train)

lr_pred = lr_model.predict(X_test)

lr_accuracy = accuracy_score(y_test, lr_pred)

#Model Evaluation and Saving
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and print its test-set metrics.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            fitted in place.
        X_train: Training features passed to ``model.fit``.
        y_train: Training labels passed to ``model.fit``.
        X_test: Features passed to ``model.predict``.
        y_test: True labels the predictions are scored against.

    Returns:
        None. The accuracy and the classification report are printed.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Headline metric first, labeled with the estimator's class name.
    score = accuracy_score(y_test, predictions)
    estimator_name = model.__class__.__name__
    print("{} Accuracy: {}".format(estimator_name, score))

    # Per-class precision/recall/F1 breakdown, then a separator line.
    report = classification_report(y_test, predictions)
    print(f"\nClassification Report:\n{report}")
    print("--------------------------------")


def evaluate_and_save_model(model, X_train, y_train, X_test, y_test, file_name):
    """Fit and evaluate ``model``, then pickle the fitted estimator.

    Fitting, prediction, and the printing of the accuracy and classification
    report are delegated to ``evaluate_model`` so both helpers always report
    metrics the same way instead of keeping two copies of that logic.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            fitted in place and then serialized.
        X_train: Training features passed to ``model.fit``.
        y_train: Training labels passed to ``model.fit``.
        X_test: Features passed to ``model.predict``.
        y_test: True labels the predictions are scored against.
        file_name: Path of the pickle file to write; an existing file is
            overwritten.

    Returns:
        None. Metrics and the save confirmation are printed.
    """
    evaluate_model(model, X_train, y_train, X_test, y_test)

    # NOTE: pickle files can execute arbitrary code when loaded; only load
    # the saved model from a trusted location.
    with open(file_name, 'wb') as file:
        pickle.dump(model, file)

    print(f"Model saved to {file_name}")
""" 
xgb_model = xgb.XGBClassifier(random_state=42)
#evaluate_and_save_model(xgb_model, X_train, y_train, X_test, y_test, 'xgb_model.pkl')
evaluate_model(xgb_model, X_train, y_train, X_test, y_test)
evaluate_and_save_model(xgb_model, X_resampled, y_resampled, X_test, y_test, 'xgb_model_resampled.pkl')

dt_model = DecisionTreeClassifier(random_state=42)
#evaluate_and_save_model(dt_model, X_train, y_train, X_test, y_test, 'dt_model.pkl')
evaluate_model(dt_model, X_train, y_train, X_test, y_test)

rf_model = RandomForestClassifier(random_state=42)
evaluate_and_save_model(rf_model, X_train, y_train, X_test, y_test, 'rf_model.pkl')

nb_model = GaussianNB()
evaluate_and_save_model(nb_model, X_train, y_train, X_test, y_test, 'nb_model.pkl')

svm_model = SVC(random_state=42)
evaluate_and_save_model(svm_model, X_train, y_train, X_test, y_test, 'svm_model.pkl')

knn_model = KNeighborsClassifier()
evaluate_and_save_model(knn_model, X_train, y_train, X_test, y_test, 'knn_model.pkl')

#Feature Importance
feature_imporance = xgb_model.feature_importances_
feature_names = features.columns

feature_importance_df = pd.DataFrame({
    'Feature': feature_names, 'Importance': feature_imporance
})

feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

 """
#Voting Classifier
# Disabled experiment kept as an inert string literal (it has no runtime
# effect): it would build a hard-voting ensemble of XGBoost, random forest, and
# SVC estimators and pass it to evaluate_and_save_model with
# 'voting_model.pkl' as the output file.
""" 
voting_model = VotingClassifier(
    estimators=[('xgb', xgb.XGBClassifier(random_state=42)), ('rf', RandomForestClassifier(random_state=42)), ('svm', SVC(random_state=42, probability=True))], 
    voting='hard'
)
evaluate_and_save_model(voting_model, X_train, y_train, X_test, y_test, 'voting_model.pkl') """
# Disabled plotting code, also an inert string literal: a horizontal bar chart
# of feature_importance_df. That DataFrame is only built inside the disabled
# model-comparison string earlier in the file, so that code must be re-enabled
# before this plot can run.
""" 
plt.figure(figsize=(10, 6))
plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
plt.xticks(rotation=90)
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importance') """