Sandeep S committed on
Commit ·
32bec34
1
Parent(s): 13231d6
Added multivariate analysis graphs
Browse files
- graphs/Churn.png +0 -0
- graphs/Contract.png +0 -0
- graphs/Correlation.png +0 -0
- graphs/Dependents.png +0 -0
- graphs/DeviceProtection.png +0 -0
- graphs/MonthlyCharges.png +0 -0
- graphs/OnlineBackup.png +0 -0
- graphs/OnlineSecurity.png +0 -0
- graphs/PaperlessBilling.png +0 -0
- graphs/Partner.png +0 -0
- graphs/PaymentMethod.png +0 -0
- graphs/SeniorCitizen.png +0 -0
- graphs/TechSupport.png +0 -0
- graphs/tenure.png +0 -0
- main.py +31 -2
graphs/Churn.png
ADDED
|
graphs/Contract.png
ADDED
|
graphs/Correlation.png
ADDED
|
graphs/Dependents.png
ADDED
|
graphs/DeviceProtection.png
ADDED
|
graphs/MonthlyCharges.png
ADDED
|
graphs/OnlineBackup.png
ADDED
|
graphs/OnlineSecurity.png
ADDED
|
graphs/PaperlessBilling.png
ADDED
|
graphs/Partner.png
ADDED
|
graphs/PaymentMethod.png
ADDED
|
graphs/SeniorCitizen.png
ADDED
|
graphs/TechSupport.png
ADDED
|
graphs/tenure.png
ADDED
|
main.py
CHANGED
|
@@ -1,12 +1,41 @@
|
|
| 1 |
import pandas as pd
|
| 2 |
import matplotlib.pyplot as plt
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
| 4 |
from sklearn.model_selection import train_test_split
|
| 5 |
|
| 6 |
dataset = pd.read_csv("data.csv")
|
| 7 |
|
|
|
|
|
|
|
| 8 |
dataset.drop(columns=['customerID'], inplace=True)
|
| 9 |
|
|
|
|
| 10 |
encoder = LabelEncoder()
|
|
|
|
|
|
|
|
|
|
| 11 |
for column in dataset.select_dtypes(include=['object']).columns:
|
| 12 |
-
dataset[column] = encoder.fit_transform(dataset[column])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import pandas as pd
|
| 2 |
import matplotlib.pyplot as plt
|
| 3 |
+
import seaborn as sns
|
| 4 |
+
from sklearn.linear_model import LogisticRegression
|
| 5 |
+
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
|
| 6 |
+
from sklearn.preprocessing import LabelEncoder, StandardScaler
|
| 7 |
from sklearn.model_selection import train_test_split
|
| 8 |
|
| 9 |
dataset = pd.read_csv("data.csv")
|
| 10 |
|
| 11 |
+
# Dropping customerID which is unique for each customer
|
| 12 |
+
# and does not provide any useful information for prediction
|
| 13 |
dataset.drop(columns=['customerID'], inplace=True)
|
| 14 |
|
| 15 |
+
# Encoding categorical variables and scaling numerical variables
|
| 16 |
encoder = LabelEncoder()
|
| 17 |
+
scaler = StandardScaler()
|
| 18 |
+
for column in dataset.select_dtypes(include=['int64', 'float64']).columns:
|
| 19 |
+
dataset[column] = scaler.fit_transform(dataset[column].values.reshape(-1, 1))
|
| 20 |
for column in dataset.select_dtypes(include=['object']).columns:
|
| 21 |
+
dataset[column] = encoder.fit_transform(dataset[column])
|
| 22 |
+
|
| 23 |
+
# Plotting each feature's correlation with Churn to find the most important features
|
| 24 |
+
fig, ax = plt.subplots(figsize=(16, 10))
|
| 25 |
+
corr=dataset.corr()["Churn"]
|
| 26 |
+
ax.set_xticklabels(corr.index, rotation=45, ha='right', fontsize=10)
|
| 27 |
+
|
| 28 |
+
sns.barplot(x=corr.index, y=corr.values, ax=ax)
|
| 29 |
+
plt.savefig("graphs/Correlation.png")
|
| 30 |
+
|
| 31 |
+
# Based on the correlation plot, the following features are removed
|
| 32 |
+
dataset.drop(columns=['gender', 'PhoneService', 'MultipleLines', 'InternetService', 'StreamingTV', 'StreamingMovies', 'TotalCharges'], inplace=True)
|
| 33 |
+
# Multivariate analysis
|
| 34 |
+
fig, ax = plt.subplots(figsize=(16, 10))
|
| 35 |
+
sns.heatmap(dataset.corr(), annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
|
| 36 |
+
plt.savefig("graphs/Heatmap.png")
|
| 37 |
+
|
| 38 |
+
for column in dataset.columns:
|
| 39 |
+
fig, ax = plt.subplots(figsize=(16, 10))
|
| 40 |
+
sns.kdeplot(dataset[column],ax=ax)
|
| 41 |
+
plt.savefig(f"graphs/{column}.png")
|