#!/usr/bin/env python
# coding: utf-8
# In[1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# In[3]:
# Load the datasets
file_paths = {
    "bank_additional": "bank-additional.xlsx",
    "bank_additional_full": "bank-additional-full.xlsx",
    "bank_full": "bank-full.xlsx",
    "bank": "bank.xlsx",
}
# In[6]:
# Reading the datasets into pandas dataframes
bank_additional = pd.read_excel(file_paths["bank_additional"])
# In[7]:
# Reading the datasets into pandas dataframes
bank_additional_full = pd.read_excel(file_paths["bank_additional_full"])
# In[8]:
# Reading the datasets into pandas dataframes
bank_full = pd.read_excel(file_paths["bank_full"])
# In[9]:
# Reading the datasets into pandas dataframes
bank = pd.read_excel(file_paths["bank"])
# In[10]:
# Printing basic info for each dataset to understand its structure.
# Note: DataFrame.info() prints its summary and returns None, so storing the
# return values in a dict (as originally written) would only capture Nones.
for name, df in [("bank_additional", bank_additional),
                 ("bank_additional_full", bank_additional_full),
                 ("bank_full", bank_full),
                 ("bank", bank)]:
    print(f"--- {name} ---")
    df.info()
# In[11]:
bank_additional.head()
# In[12]:
bank_additional_full.head()
# In[13]:
bank_full.head()
# In[14]:
bank.head()
# In[15]:
# Using the bank_additional_full dataset for EDA
data = bank_additional_full.copy()
# In[16]:
# Checking for missing values
missing_values = data.isnull().sum()
# In[18]:
# Basic statistics
basic_stats = data.describe(include="all")
missing_values, basic_stats
# In[19]:
# 1. Overview of the dataset
print("Dataset shape:", data.shape)
# In[20]:
print("\nDataset sample:\n", data.head())
# In[21]:
print("\nData types:\n", data.dtypes)
# In[22]:
# 2. Summary statistics
print("\nSummary statistics (numerical features):\n", data.describe())
# In[23]:
print("\nSummary statistics (categorical features):\n", data.describe(include=['object']))
# In[25]:
# 3. Correlation analysis (numerical features)
numerical_features = data.select_dtypes(include=['int64', 'float64']).columns
plt.figure(figsize=(10, 8))
sns.heatmap(data[numerical_features].corr(), annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix (Numerical Features)')
plt.show()
# In[26]:
# 4. Distribution of key numerical features
for col in numerical_features:
    plt.figure(figsize=(6, 4))
    sns.histplot(data[col], kde=True, bins=30)
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.show()
# In[27]:
# 5. Boxplots to identify outliers
for col in numerical_features:
    plt.figure(figsize=(6, 4))
    sns.boxplot(x=data[col])  # horizontal boxplot so the x-axis label matches
    plt.title(f'Boxplot of {col}')
    plt.xlabel(col)
    plt.show()
# In[28]:
# 6. Relationship between categorical features and the target
categorical_features = data.select_dtypes(include=['object']).columns.drop('y')  # exclude the target itself
for col in categorical_features:
    plt.figure(figsize=(10, 6))
    sns.countplot(x=col, hue='y', data=data)
    plt.title(f'{col} vs Subscription (y)')
    plt.xlabel(col)
    plt.ylabel('Count')
    plt.legend(title='Subscription', loc='upper right')
    plt.xticks(rotation=45)
    plt.show()
# In[15]:
# 7. Visualizing target variable distribution
plt.figure(figsize=(8, 6))
sns.countplot(data=data, x='y', palette='coolwarm')
plt.title("Subscription Outcome Distribution (y)", fontsize=14)
plt.xlabel("Subscription ('yes' or 'no')")
plt.ylabel("Count")
plt.show()
# In[16]:
# 8. Correlation heatmap for numerical features
plt.figure(figsize=(10, 8))
numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns
correlation_matrix = data[numerical_cols].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Heatmap for Numerical Features", fontsize=14)
plt.show()
# Summary of Findings from EDA:
#
# Data integrity:
# - There are no missing values in any feature.
# - The target variable y (subscription) is imbalanced: far more "no" than
#   "yes" responses (roughly 11% "yes" in bank-additional-full). Addressing
#   this imbalance is critical during model training.
#
# Numerical feature correlations:
# - Features like euribor3m (3-month Euribor rate) and nr.employed (number of
#   employees) are strongly correlated with other numerical variables,
#   indicating potential predictive power but also redundancy.
#
# Key statistics:
# - Age ranges from 17 to 98, with a mean of ~40.
# - pdays and previous contain many sentinel values (e.g., 999 in pdays marks
#   clients never previously contacted) that need special handling; a sketch
#   follows this summary.
#
# Next Steps:
#
# Data preprocessing:
# - Handle imbalanced classes using oversampling (e.g., SMOTE) or class weighting.
# - Normalize numerical features for algorithms sensitive to feature scales.
# - Encode categorical variables using one-hot or label encoding (a one-hot
#   sketch follows the label-encoding cell below).
#
# Feature engineering:
# - Evaluate feature importance.
# - Consider interactions or metrics derived from existing features.
#
# Predictive modeling:
# - Train models such as Logistic Regression, Random Forest, or Gradient Boosting.
# - Use cross-validation with imbalance-aware metrics such as F1 (see the
#   cross-validation sketch after the model evaluation below).
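# In[ ]:
# Illustrative sketch (an assumed treatment, not applied to the pipeline
# below): one common way to handle the 999 sentinel in pdays is to add an
# explicit "was previously contacted" flag and neutralize the sentinel, so
# the magnitude 999 does not distort scaling or model coefficients.
data_fe = data.copy()
data_fe['was_contacted'] = (data_fe['pdays'] != 999).astype(int)
data_fe.loc[data_fe['pdays'] == 999, 'pdays'] = 0  # value is arbitrary once the flag exists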
# In[30]:
# Encode categorical features
categorical_columns = data.select_dtypes(include=['object']).columns
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le  # kept so encodings can be inverted later
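# In[ ]:
# Alternative sketch: one-hot encoding with pd.get_dummies, as mentioned in
# the summary above. Built from a fresh copy because `data` is already
# label-encoded at this point. The pipeline below keeps the label-encoded
# version; one-hot avoids imposing an artificial order on categories.
data_onehot = pd.get_dummies(bank_additional_full.drop('y', axis=1), drop_first=True)
data_onehot.shape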
# In[31]:
# Split the data into features and target
X = data.drop('y', axis=1) # Assuming 'y' is the target column
y = data['y']
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# In[32]:
# Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)
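# Sanity check: class counts before and after resampling; SMOTE should bring
# the minority class up to parity with the majority class.
print("Before SMOTE:\n", y_train.value_counts())
print("After SMOTE:\n", pd.Series(y_train_balanced).value_counts())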
# Scale numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_balanced)
X_test_scaled = scaler.transform(X_test)
# In[33]:
# Train a Logistic Regression model (max_iter raised from the default of 100,
# which often fails to converge on this data)
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train_scaled, y_train_balanced)
# In[34]:
# Make predictions
y_pred = model.predict(X_test_scaled)
# In[35]:
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
# Insights and Next Steps:
#
# Feature importance: logistic regression coefficients indicate feature
# influence; because the inputs were standardized, coefficients with larger
# absolute values contribute more to the prediction (see the sketch below).
#
# Evaluation metrics: the classification report gives precision, recall, and
# F1 per class; given the imbalance, the minority-class ("yes") recall and F1
# are more informative than overall accuracy.
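# In[ ]:
# Sketch of the coefficient-based importance mentioned above: rank features by
# the absolute value of their coefficients. Since the inputs were standardized,
# magnitudes are roughly comparable across features (though label-encoded
# categoricals make this only a rough guide).
importances = pd.Series(model.coef_[0], index=X.columns)
print(importances.reindex(importances.abs().sort_values(ascending=False).index).head(10))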