HOLYBOY committed on
Commit
baae561
·
verified ·
1 Parent(s): 2b2790e

PyFileAdded

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ bank-additional-full.xlsx filter=lfs diff=lfs merge=lfs -text
37
+ bank-full.xlsx filter=lfs diff=lfs merge=lfs -text
Azubi Africa.py ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+
4
+ # In[1]:
5
+
6
+
7
+ import pandas as pd
8
+ import matplotlib.pyplot as plt
9
+ import seaborn as sns
10
+ from sklearn.model_selection import train_test_split
11
+ from sklearn.preprocessing import StandardScaler, LabelEncoder
12
+ from imblearn.over_sampling import SMOTE
13
+ from sklearn.linear_model import LogisticRegression
14
+ from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
15
+
16
+
17
+ # In[3]:
18
+
19
+
20
# Workbook locations, keyed by the variable name each DataFrame is bound to.
file_paths = dict(
    bank_additional="bank-additional.xlsx",
    bank_additional_full="bank-additional-full.xlsx",
    bank_full="bank-full.xlsx",
    bank="bank.xlsx",
)

# Read every workbook into its own pandas DataFrame.  Dicts keep insertion
# order, so the unpacking targets line up with the keys declared above and
# the files are read in the same order as before.
bank_additional, bank_additional_full, bank_full, bank = (
    pd.read_excel(path) for path in file_paths.values()
)
55
+
56
+
57
+ # In[10]:
58
+
59
+
60
# Show the structure and the first rows of each loaded dataset.
#
# DataFrame.info() prints its report and returns None, so building a dict
# from its return values only stores None.  The report is printed explicitly
# here, and `datasets_info` keeps a real per-column summary (dtype and
# non-null count) for each dataset instead.
loaded_datasets = {
    "bank_additional": bank_additional,
    "bank_additional_full": bank_additional_full,
    "bank_full": bank_full,
    "bank": bank,
}

datasets_info = {}
for name, frame in loaded_datasets.items():
    print(f"\n=== {name} ===")
    frame.info()
    datasets_info[name] = pd.DataFrame(
        {"dtype": frame.dtypes, "non_null": frame.count()}
    )

# A bare expression is only echoed inside a notebook; print the previews so
# they also appear when this file is executed as a plain script.
for name in ("bank_additional", "bank_additional_full", "bank_full"):
    print(f"\nFirst rows of {name}:\n", loaded_datasets[name].head())

for name, summary in datasets_info.items():
    print(f"\nColumn summary for {name}:\n", summary)
91
+
92
+
93
+ # In[15]:
94
+
95
+
96
# Using the bank_additional_full dataset for EDA.  Work on a copy so the
# loaded frame stays untouched by later in-place encoding.
data = bank_additional_full.copy()

# Missing values per column and descriptive statistics over every column.
# The statistics are computed once (the export repeated the same call twice)
# and printed explicitly, because a bare expression produces no output when
# the file runs as a script.
missing_values = data.isnull().sum()
basic_stats = data.describe(include="all")
print("Missing values per column:\n", missing_values)
print("\nBasic statistics (all features):\n", basic_stats)

# 1. Overview of the dataset
print("Dataset shape:", data.shape)
print("\nDataset sample:\n", data.head())
print("\nData types:\n", data.dtypes)

# 2. Summary statistics, split by feature kind
print("\nSummary statistics (numerical features):\n", data.describe())
print("\nSummary statistics (categorical features):\n", data.describe(include=['object']))
153
+
154
+
155
+ # In[25]:
156
+
157
+
158
# 3. Correlation analysis (numerical features)
numerical_features = data.select_dtypes(include=['int64', 'float64']).columns
plt.figure(figsize=(10, 8))
sns.heatmap(data[numerical_features].corr(), annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix (Numerical Features)')
plt.show()

# 4. Distribution of each numerical feature (histogram with a KDE overlay).
for col in numerical_features:
    plt.figure(figsize=(6, 4))
    sns.histplot(data=data, x=col, kde=True, bins=30)
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.show()

# 5. Boxplots to identify outliers.
# The column is passed explicitly as `x`: a positional series is interpreted
# differently across seaborn versions, which can flip the plot orientation
# and leave the x-axis label below attached to the wrong axis.
for col in numerical_features:
    plt.figure(figsize=(6, 4))
    sns.boxplot(data=data, x=col)
    plt.title(f'Boxplot of {col}')
    plt.xlabel(col)
    plt.show()
189
+
190
+
191
+ # In[28]:
192
+
193
+
194
# 6. Relationship between each categorical feature and the target.
categorical_features = data.select_dtypes(include=['object']).columns

# The target column is itself selected as a categorical feature; plotting it
# against itself as hue carries no information, so it is skipped here (its
# distribution is plotted separately in step 7).
for col in (name for name in categorical_features if name != 'y'):
    plt.figure(figsize=(10, 6))
    sns.countplot(x=col, hue='y', data=data)
    plt.title(f'{col} vs Subscription (y)')
    plt.xlabel(col)
    plt.ylabel('Count')
    plt.legend(title='Subscription', loc='upper right')
    plt.xticks(rotation=45)
    plt.show()

# 7. Target variable distribution.
# Recent seaborn releases deprecate `palette` without `hue`, so the x variable
# is also assigned to `hue`.  `dodge=False` keeps one full-width bar per
# class, and any legend seaborn adds is removed because it would only repeat
# the x-axis labels.
plt.figure(figsize=(8, 6))
target_axes = sns.countplot(data=data, x='y', hue='y', palette='coolwarm', dodge=False)
target_legend = target_axes.get_legend()
if target_legend is not None:
    target_legend.remove()
plt.title("Subscription Outcome Distribution (y)", fontsize=14)
plt.xlabel("Subscription ('yes' or 'no')")
plt.ylabel("Count")
plt.show()

# 8. Correlation heatmap for numerical features.
# The matrix is computed before the figure is opened so that a failure in
# the computation does not leave an empty figure behind.
numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns
correlation_matrix = data[numerical_cols].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Heatmap for Numerical Features", fontsize=14)
plt.show()
230
+
231
+
232
+ # Summary of Findings from EDA:
233
+ # Data Integrity:
234
+ #
235
+ # There are no missing values across all features in the dataset.
236
+ # The target variable y (subscription) is imbalanced, with significantly more "no" than "yes" responses. Addressing this imbalance will be critical during model training.
237
+ # Numerical Feature Correlations:
238
+ #
239
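+ # Features like euribor3m (3-month Euribor rate) and nr.employed (number of employees) exhibit strong correlations with other numerical variables. This indicates multicollinearity among these features rather than predictive power by itself, and should be kept in mind when interpreting linear-model coefficients.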
240
+ # Key Statistics:
241
+ #
242
+ # Age ranges from 17 to 98, with a mean of ~40.
243
+ # Features such as pdays and previous show many default values (e.g., 999 for pdays), likely needing special handling.
244
+ # Next Steps:
245
+ # Data Preprocessing:
246
+ #
247
+ # Handle imbalanced classes using oversampling (e.g., SMOTE) or class weighting.
248
+ # Normalize numerical features for algorithms sensitive to feature scales.
249
+ # Encode categorical variables using techniques like one-hot encoding or label encoding.
250
+ # Feature Engineering:
251
+ #
252
+ # Evaluate feature importance.
253
+ # Consider interactions or derived metrics from existing features.
254
+ # Predictive Modeling:
255
+ #
256
+ # Train models like Logistic Regression, Random Forest, or Gradient Boosting.
257
+ # Use cross-validation to assess model performance using metrics such as F1 score due to the class imbalance.
258
+
259
+ # In[30]:
260
+
261
+
262
# Replace every object-typed column (the target 'y' included) with integer
# codes, keeping the fitted encoder per column so codes can be mapped back to
# the original labels later.
label_encoders = {}
categorical_columns = data.select_dtypes(include=['object']).columns
for col in categorical_columns:
    le = LabelEncoder().fit(data[col])
    label_encoders[col] = le
    data[col] = le.transform(data[col])
269
+
270
+
271
+ # In[31]:
272
+
273
+
274
# Split the data into features and target
X = data.drop('y', axis=1)  # Assuming 'y' is the target column
y = data['y']

# Stratified train-test split so both parts keep the original class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardise BEFORE oversampling.  SMOTE synthesises minority samples by
# interpolating between nearest neighbours, so running it on unscaled data
# lets large-magnitude columns dominate the distance.  Fitting the scaler on
# the real training rows also keeps its statistics free of synthetic samples.
scaler = StandardScaler()
X_train_standardized = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply SMOTE to the training part only; the test set keeps its natural
# class balance so the evaluation stays honest.
# NOTE(review): the categorical columns are integer label codes, which SMOTE
# interpolates as if they were continuous - consider SMOTENC; confirm.
smote = SMOTE(random_state=42)
X_train_scaled, y_train_balanced = smote.fit_resample(X_train_standardized, y_train)

# Kept for backward compatibility with code that refers to the old name; it
# now holds the balanced *and* scaled training matrix.
X_train_balanced = X_train_scaled
293
+
294
+
295
+ # In[33]:
296
+
297
+
298
# Fit a logistic-regression classifier on the balanced, scaled training set
# (fit() returns the estimator itself, so the calls can be chained).
model = LogisticRegression(random_state=42).fit(X_train_scaled, y_train_balanced)

# Predict labels for the held-out test set.
y_pred = model.predict(X_test_scaled)

# Evaluate: overall accuracy, confusion matrix, then the per-class
# precision / recall / F1 report.
test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nConfusion Matrix:\n", test_confusion)
print("\nClassification Report:\n", test_report)
317
+
318
+
319
+ # Insights and Next Steps:
320
+ #
321
+ # Feature Importance: Logistic regression provides coefficients that indicate feature importance. Features with higher absolute coefficients contribute more to the prediction.
322
+ #
323
+ # Evaluation Metrics: The classification report provides accuracy, precision, recall, and F1 scores.
324
+
325
+ # In[ ]:
326
+
327
+
328
+
329
+
330
+
331
+ # In[ ]:
332
+
333
+
334
+
335
+
bank-additional-full.xlsx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e746abde169e0ce0e0410d1d8eb35bb96c75ce7b93d4d4008f623ccd0ba1b57b
3
+ size 3582419
bank-additional.xlsx ADDED
Binary file (416 kB). View file
 
bank-full.xlsx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6c298895827c34e1db9e8f57b557eeed3ba146edab66d60c031af17d0faf1cc
3
+ size 3410864
bank.xlsx ADDED
Binary file (360 kB). View file