# Azubi_assignment / Azubi Africa.py
# HOLYBOY's picture
# PyFileAdded
# baae561 verified
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# In[3]:
# Map each dataset's short name to its Excel workbook path.
# Filenames mirror the short names with underscores replaced by hyphens.
file_paths = {
    name: f"{name.replace('_', '-')}.xlsx"
    for name in ("bank_additional", "bank_additional_full", "bank_full", "bank")
}
# In[6]:
# Read every workbook listed in file_paths into its own DataFrame.
# Insertion order of file_paths is preserved, so the files are read in
# the same sequence as the original one-per-cell reads.
_frames = {name: pd.read_excel(path) for name, path in file_paths.items()}
bank_additional = _frames["bank_additional"]
bank_additional_full = _frames["bank_additional_full"]
bank_full = _frames["bank_full"]
bank = _frames["bank"]
# In[10]:
# Collect a structural summary (columns, dtypes, non-null counts) for each
# dataset. NOTE: DataFrame.info() PRINTS its report and returns None, so the
# original dict mapped every name to None. Redirect the report into a
# StringIO buffer so datasets_info actually holds the summary text.
import io

datasets_info = {}
for _name, _df in (
    ("bank_additional", bank_additional),
    ("bank_additional_full", bank_additional_full),
    ("bank_full", bank_full),
    ("bank", bank),
):
    _buf = io.StringIO()
    _df.info(buf=_buf)  # write the report into the buffer instead of stdout
    datasets_info[_name] = _buf.getvalue()
# In[11]:
# Preview the first rows of each dataset. In the exported .py script these
# were bare expressions, which display nothing outside a notebook — print
# them explicitly so the script produces the intended output.
print(bank_additional.head())
# In[12]:
print(bank_additional_full.head())
# In[13]:
print(bank_full.head())
# In[14]:
print(datasets_info)
# In[15]:
# Work on a copy of the richest dataset (bank_additional_full) for EDA so
# the original frame stays untouched.
data = bank_additional_full.copy()
# In[16]:
# Count missing values per column.
missing_values = data.isnull().sum()
# In[18]:
# Summary statistics for all columns (numeric and categorical).
# Computed once — the original ran the identical describe() call twice.
basic_stats = data.describe(include="all")
# Bare expression: displays only in a notebook; harmless no-op in a script.
missing_values, basic_stats
# In[19]:
# 1. Overview of the dataset
shape_info = data.shape
print("Dataset shape:", shape_info)
# In[20]:
sample_rows = data.head()
print("\nDataset sample:\n", sample_rows)
# In[21]:
dtype_info = data.dtypes
print("\nData types:\n", dtype_info)
# In[22]:
# 2 Summary statistics
numeric_summary = data.describe()
print("\nSummary statistics (numerical features):\n", numeric_summary)
# In[23]:
categorical_summary = data.describe(include=['object'])
print("\nSummary statistics (categorical features):\n", categorical_summary)
# In[25]:
# 3. Correlation analysis (numerical features)
# numerical_features is reused by the distribution/boxplot cells below.
numerical_features = data.select_dtypes(include=['int64', 'float64']).columns
corr = data[numerical_features].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix (Numerical Features)')
plt.show()
# In[26]:
# 4 Distribution of key numerical features: one histogram (with KDE) each.
for feature in numerical_features:
    plt.figure(figsize=(6, 4))
    sns.histplot(data[feature], kde=True, bins=30)
    plt.title(f'Distribution of {feature}')
    plt.xlabel(feature)
    plt.ylabel('Frequency')
    plt.show()
# In[27]:
# 5 Boxplot per numerical feature to surface outliers.
for feature in numerical_features:
    plt.figure(figsize=(6, 4))
    sns.boxplot(data[feature])
    plt.title(f'Boxplot of {feature}')
    plt.xlabel(feature)
    plt.show()
# In[28]:
# 6 Relationship between each categorical feature and the target 'y':
# a count plot split by subscription outcome.
categorical_features = data.select_dtypes(include=['object']).columns
for feature in categorical_features:
    plt.figure(figsize=(10, 6))
    sns.countplot(x=feature, hue='y', data=data)
    plt.title(f'{feature} vs Subscription (y)')
    plt.xlabel(feature)
    plt.ylabel('Count')
    plt.legend(title='Subscription', loc='upper right')
    plt.xticks(rotation=45)  # long category labels overlap otherwise
    plt.show()
# In[15]:
# 7 Visualizing target variable distribution.
# seaborn (>=0.13) deprecated passing `palette` without `hue`; assigning the
# x variable to hue and suppressing the redundant legend yields the same
# figure without the FutureWarning.
plt.figure(figsize=(8, 6))
sns.countplot(data=data, x='y', hue='y', palette='coolwarm', legend=False)
plt.title("Subscription Outcome Distribution (y)", fontsize=14)
plt.xlabel("Subscription ('yes' or 'no')")
plt.ylabel("Count")
plt.show()
# In[16]:
# 8 Correlation heatmap for numerical features (duplicate of the earlier
# correlation cell, kept for the notebook's narrative flow).
numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns
correlation_matrix = data[numerical_cols].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Heatmap for Numerical Features", fontsize=14)
plt.show()
# Summary of Findings from EDA:
# Data Integrity:
#
# There are no missing values across all features in the dataset.
# The target variable y (subscription) is imbalanced, with significantly more "no" than "yes" responses. Addressing this imbalance will be critical during model training.
# Numerical Feature Correlations:
#
# Features like euribor3m (3-month Euribor rate) and nr.employed (number of employees) exhibit strong correlations with other numerical variables, indicating potential predictive power.
# Key Statistics:
#
# Age ranges from 17 to 98, with a mean of ~40.
# Features such as pdays and previous show many default values (e.g., 999 for pdays), likely needing special handling.
# Next Steps:
# Data Preprocessing:
#
# Handle imbalanced classes using oversampling (e.g., SMOTE) or class weighting.
# Normalize numerical features for algorithms sensitive to feature scales.
# Encode categorical variables using techniques like one-hot encoding or label encoding.
# Feature Engineering:
#
# Evaluate feature importance.
# Consider interactions or derived metrics from existing features.
# Predictive Modeling:
#
# Train models like Logistic Regression, Random Forest, or Gradient Boosting.
# Use cross-validation to assess model performance using metrics such as F1 score due to the class imbalance.
# In[30]:
# Label-encode every categorical column in place, keeping each fitted
# encoder so the integer codes can be mapped back to labels later.
categorical_columns = data.select_dtypes(include=['object']).columns
label_encoders = {}
for column in categorical_columns:
    encoder = LabelEncoder()
    data[column] = encoder.fit_transform(data[column])
    label_encoders[column] = encoder
# In[31]:
# Split the data into features and target
X = data.drop('y', axis=1) # Assuming 'y' is the target column
y = data['y']
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# In[32]:
# Rebalance the training set with SMOTE (test set left untouched), then
# standardize features. The scaler is fit on the balanced training data
# only and reused on the test split to avoid leakage.
oversampler = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = oversampler.fit_resample(X_train, y_train)

feature_scaler = StandardScaler()
X_train_scaled = feature_scaler.fit_transform(X_train_balanced)
X_test_scaled = feature_scaler.transform(X_test)
# In[33]:
# Train a Logistic Regression model.
# max_iter raised from the default 100: the lbfgs solver commonly hits the
# iteration cap on this many rows/features and stops before converging
# (ConvergenceWarning); 1000 iterations fixes that without changing the API.
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train_scaled, y_train_balanced)
# In[34]:
# Generate predictions on the held-out test set.
y_pred = model.predict(X_test_scaled)
# In[35]:
# Evaluate: overall accuracy, confusion matrix, and per-class
# precision/recall/F1 (the latter matter most given the class imbalance).
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
conf_mat = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", conf_mat)
report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)
# Insights and Next Steps:
#
# Feature Importance: Logistic regression provides coefficients that indicate feature importance. Features with higher absolute coefficients contribute more to the prediction.
#
# Evaluation Metrics: The classification report provides accuracy, precision, recall, and F1 scores.
# In[ ]:
# In[ ]: