Spaces:
Sleeping
Sleeping
File size: 4,898 Bytes
8f39cdb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 |
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import pickle
#models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
#Improving accuracy
from imblearn.over_sampling import SMOTE
# Load the raw churn dataset; expects churn.csv in the current working directory.
df = pd.read_csv('churn.csv')
# Exploratory plots — all sns.* calls are currently disabled.
# NOTE(review): because the plot calls are commented out, the successive
# plt.title() calls below just overwrite each other on the same (empty)
# figure; if the plots are re-enabled, each one needs its own plt.figure().
sns.set_style(style="whitegrid")
plt.figure(figsize=(12, 10))
#sns.countplot(x='Exited', data=df)
plt.title('Churn Distribution')
#sns.histplot(data=df, x='Age', kde=True)
plt.title('Age Distribution')
#sns.scatterplot(data=df, x='CreditScore', y='Age', hue='Exited')
plt.title('Credit Score vs Age')
#sns.boxplot(data=df, x='Exited', y='Balance')
plt.title('Balance vs Churn')
#sns.boxplot(x='Exited', y='CreditScore', data=df)
plt.title('Credit Score vs Churn')
#plt.show()
#Feature Engineering
# Drop the label and pure-identifier columns, derive three engineered
# features, then one-hot encode the categoricals (including the new AgeGroup).
_id_like = ['Exited', 'RowNumber', 'CustomerId', 'Surname']
_age_bins = [0, 30, 45, 60, 100]
_age_labels = ["Young", "MiddleAged", "Senior", "Elderly"]
features = df.drop(columns=_id_like).assign(
    # crude customer-lifetime-value proxy from balance and salary
    CLV=df["Balance"] * df["EstimatedSalary"] / 100000,
    AgeGroup=pd.cut(df["Age"], bins=_age_bins, labels=_age_labels),
    TenureAgeRatio=df["Tenure"] / df["Age"],
)
features = pd.get_dummies(features, columns=['Geography', 'Gender', 'AgeGroup'])
target = df['Exited']
#Train Test Split
# Hold out 20% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
# Fit the scaler on the TRAINING split only, then apply that same fitted
# transformation to the test split.  The original code called
# fit_transform() on X_test too, which re-fits the scaler on test-set
# statistics — a data-leakage bug that distorts every downstream metric.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
#SMOTE
# Oversample the minority class of the training data only (never the test set).
# NOTE(review): X_resampled/y_resampled are not used by the logistic
# regression below — only by the disabled code later in this file; confirm
# whether the LR model was meant to train on the resampled data.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
#Logistic Regression
# Baseline model; its accuracy is kept in lr_accuracy but never printed here.
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_pred)
#Model Evaluation and Saving
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Train *model* on the training split and report held-out metrics.

    Fits the estimator in place, then prints its test-set accuracy, a full
    classification report, and a separator line.  Returns None.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    model_name = model.__class__.__name__
    print(f"{model_name} Accuracy: {accuracy_score(y_test, predictions)}")
    print(f"\nClassification Report:\n{classification_report(y_test, predictions)}")
    print(f"--------------------------------")
def evaluate_and_save_model(model, X_train, y_train, X_test, y_test, file_name):
    """Fit and evaluate *model*, then persist the fitted estimator to *file_name*.

    The fit / predict / report steps are delegated to evaluate_model(),
    which previously was duplicated verbatim in this function.  After
    evaluation the trained model is serialized with pickle.
    Prints a confirmation line with the output path.  Returns None.
    """
    evaluate_model(model, X_train, y_train, X_test, y_test)
    with open(file_name, 'wb') as file:
        pickle.dump(model, file)
    print(f"Model saved to {file_name}")
"""
xgb_model = xgb.XGBClassifier(random_state=42)
#evaluate_and_save_model(xgb_model, X_train, y_train, X_test, y_test, 'xgb_model.pkl')
evaluate_model(xgb_model, X_train, y_train, X_test, y_test)
evaluate_and_save_model(xgb_model, X_resampled, y_resampled, X_test, y_test, 'xgb_model_resampled.pkl')
dt_model = DecisionTreeClassifier(random_state=42)
#evaluate_and_save_model(dt_model, X_train, y_train, X_test, y_test, 'dt_model.pkl')
evaluate_model(dt_model, X_train, y_train, X_test, y_test)
rf_model = RandomForestClassifier(random_state=42)
evaluate_and_save_model(rf_model, X_train, y_train, X_test, y_test, 'rf_model.pkl')
nb_model = GaussianNB()
evaluate_and_save_model(nb_model, X_train, y_train, X_test, y_test, 'nb_model.pkl')
svm_model = SVC(random_state=42)
evaluate_and_save_model(svm_model, X_train, y_train, X_test, y_test, 'svm_model.pkl')
knn_model = KNeighborsClassifier()
evaluate_and_save_model(knn_model, X_train, y_train, X_test, y_test, 'knn_model.pkl')
#Feature Importance
feature_imporance = xgb_model.feature_importances_
feature_names = features.columns
feature_importance_df = pd.DataFrame({
'Feature': feature_names, 'Importance': feature_imporance
})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
"""
#Voting Classifier
"""
voting_model = VotingClassifier(
estimators=[('xgb', xgb.XGBClassifier(random_state=42)), ('rf', RandomForestClassifier(random_state=42)), ('svm', SVC(random_state=42, probability=True))],
voting='hard'
)
evaluate_and_save_model(voting_model, X_train, y_train, X_test, y_test, 'voting_model.pkl') """
"""
plt.figure(figsize=(10, 6))
plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
plt.xticks(rotation=90)
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importance') """
|