Sandeep S committed on
Commit
32bec34
·
1 Parent(s): 13231d6

Added multivariate analysis graphs

Browse files
graphs/Churn.png ADDED
graphs/Contract.png ADDED
graphs/Correlation.png ADDED
graphs/Dependents.png ADDED
graphs/DeviceProtection.png ADDED
graphs/MonthlyCharges.png ADDED
graphs/OnlineBackup.png ADDED
graphs/OnlineSecurity.png ADDED
graphs/PaperlessBilling.png ADDED
graphs/Partner.png ADDED
graphs/PaymentMethod.png ADDED
graphs/SeniorCitizen.png ADDED
graphs/TechSupport.png ADDED
graphs/tenure.png ADDED
main.py CHANGED
@@ -1,12 +1,41 @@
1
  import pandas as pd
2
  import matplotlib.pyplot as plt
3
- from sklearn.preprocessing import LabelEncoder
 
 
 
4
  from sklearn.model_selection import train_test_split
5
 
6
  dataset = pd.read_csv("data.csv")
7
 
 
 
8
  dataset.drop(columns=['customerID'], inplace=True)
9
 
 
10
  encoder = LabelEncoder()
 
 
 
11
  for column in dataset.select_dtypes(include=['object']).columns:
12
- dataset[column] = encoder.fit_transform(dataset[column])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import pandas as pd
2
  import matplotlib.pyplot as plt
3
+ import seaborn as sns
4
+ from sklearn.linear_model import LogisticRegression
5
+ from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
6
+ from sklearn.preprocessing import LabelEncoder, StandardScaler
7
  from sklearn.model_selection import train_test_split
8
 
9
  dataset = pd.read_csv("data.csv")
10
 
11
+ # Dropping customerID which is unique for each customer
12
+ # and does not provide any useful information for prediction
13
  dataset.drop(columns=['customerID'], inplace=True)
14
 
15
+ # Encoding categorical variables and Scaling numerical variables
16
  encoder = LabelEncoder()
17
+ scaler = StandardScaler()
18
+ for column in dataset.select_dtypes(include=['int64', 'float64']).columns:
19
+ dataset[column] = scaler.fit_transform(dataset[column].values.reshape(-1, 1))
20
  for column in dataset.select_dtypes(include=['object']).columns:
21
+ dataset[column] = encoder.fit_transform(dataset[column])
22
+
23
+ # Plotting the correlation to find the most important features
24
+ fig, ax = plt.subplots(figsize=(16, 10))
25
+ corr=dataset.corr()["Churn"]
26
+ ax.set_xticklabels(corr.index, rotation=45, ha='right', fontsize=10)
27
+
28
+ sns.barplot(x=corr.index, y=corr.values, ax=ax)
29
+ plt.savefig("graphs/Correlation.png")
30
+
31
+ # Based on the correlation plot, the following features are removed
32
+ dataset.drop(columns=['gender', 'PhoneService', 'MultipleLines', 'InternetService', 'StreamingTV', 'StreamingMovies', 'TotalCharges'], inplace=True)
33
+ # Multivariate analysis
34
+ fig, ax = plt.subplots(figsize=(16, 10))
35
+ sns.heatmap(dataset.corr(), annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
36
+ plt.savefig("graphs/Heatmap.png")
37
+
38
+ for column in dataset.columns:
39
+ fig, ax = plt.subplots(figsize=(16, 10))
40
+ sns.kdeplot(dataset[column],ax=ax)
41
+ plt.savefig(f"graphs/{column}.png")