#!/usr/bin/env python
# coding: utf-8

"""Bank-marketing dataset: loading and initial inspection.

Reads the four UCI bank-marketing Excel workbooks, prints basic structure
information for each, and prepares the full "additional" dataset (`data`)
for the EDA and modelling steps that follow.
"""

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Paths to the four variants of the UCI bank-marketing dataset.
file_paths = {
    "bank_additional": "bank-additional.xlsx",
    "bank_additional_full": "bank-additional-full.xlsx",
    "bank_full": "bank-full.xlsx",
    "bank": "bank.xlsx",
}

# Read each workbook into its own dataframe.
bank_additional = pd.read_excel(file_paths["bank_additional"])
bank_additional_full = pd.read_excel(file_paths["bank_additional_full"])
bank_full = pd.read_excel(file_paths["bank_full"])
bank = pd.read_excel(file_paths["bank"])

# Display structure of each dataset.
# FIX: DataFrame.info() prints to stdout and returns None, so the original
# `datasets_info = {...: df.info(), ...}` dict held only None values.
# Call info() purely for its printed side effect instead.
for name, df in [
    ("bank_additional", bank_additional),
    ("bank_additional_full", bank_additional_full),
    ("bank_full", bank_full),
    ("bank", bank),
]:
    print(f"\n=== {name} ===")
    df.info()

# Peek at the first rows of each dataset. (In the notebook these were bare
# display expressions; in a script they must be printed explicitly.)
print(bank_additional.head())
print(bank_additional_full.head())
print(bank_full.head())

# Use the richest dataset (bank-additional-full) for EDA and modelling.
data = bank_additional_full.copy()

# Missing-value count per column (expected to be all zero for this dataset).
missing_values = data.isnull().sum()

# Summary statistics over every column, numeric and categorical.
# (Computed once — the original computed the identical describe() twice.)
basic_stats = data.describe(include="all")
print(missing_values)
print(basic_stats)
# ---------------------------------------------------------------------------
# Exploratory data analysis on `data` (the bank-additional-full dataframe).
# ---------------------------------------------------------------------------

# 1. Overview of the dataset.
print("Dataset shape:", data.shape)
print("\nDataset sample:\n", data.head())
print("\nData types:\n", data.dtypes)

# 2. Summary statistics.
print("\nSummary statistics (numerical features):\n", data.describe())
print("\nSummary statistics (categorical features):\n", data.describe(include=['object']))

# 3. Correlation analysis (numerical features).
# FIX: compute the column lists and the correlation matrix once and reuse
# them — the original re-ran select_dtypes and .corr() for the second,
# identical heatmap at the end of the EDA section.
numerical_features = data.select_dtypes(include=['int64', 'float64']).columns
correlation_matrix = data[numerical_features].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix (Numerical Features)')
plt.show()

# 4. Distribution of each numerical feature.
for col in numerical_features:
    plt.figure(figsize=(6, 4))
    sns.histplot(data[col], kde=True, bins=30)
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.show()

# 5. Boxplots to surface outliers in each numerical feature.
for col in numerical_features:
    plt.figure(figsize=(6, 4))
    sns.boxplot(data[col])
    plt.title(f'Boxplot of {col}')
    plt.xlabel(col)
    plt.show()

# 6. Relationship between each categorical feature and the target 'y'.
categorical_features = data.select_dtypes(include=['object']).columns
for col in categorical_features:
    plt.figure(figsize=(10, 6))
    sns.countplot(x=col, hue='y', data=data)
    plt.title(f'{col} vs Subscription (y)')
    plt.xlabel(col)
    plt.ylabel('Count')
    plt.legend(title='Subscription', loc='upper right')
    plt.xticks(rotation=45)
    plt.show()

# 7. Target variable distribution (visualises the class imbalance).
plt.figure(figsize=(8, 6))
sns.countplot(data=data, x='y', palette='coolwarm')
plt.title("Subscription Outcome Distribution (y)", fontsize=14)
plt.xlabel("Subscription ('yes' or 'no')")
plt.ylabel("Count")
plt.show()

# 8. Correlation heatmap for numerical features (reuses correlation_matrix
# computed in step 3 instead of recomputing it).
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Heatmap for Numerical Features", fontsize=14)
plt.show()

# Summary of Findings from EDA:
#
# Data Integrity:
#   - There are no missing values across all features in the dataset.
#   - The target variable y (subscription) is imbalanced, with significantly
#     more "no" than "yes" responses. Addressing this imbalance will be
#     critical during model training.
#
# Numerical Feature Correlations:
#   - Features like euribor3m (3-month Euribor rate) and nr.employed (number
#     of employees) exhibit strong correlations with other numerical
#     variables, indicating potential predictive power.
#
# Key Statistics:
#   - Age ranges from 17 to 98, with a mean of ~40.
#   - Features such as pdays and previous show many default values (e.g. 999
#     for pdays), likely needing special handling.
#
# Next Steps:
#
# Data Preprocessing:
#   - Handle imbalanced classes using oversampling (e.g. SMOTE) or class
#     weighting.
#   - Normalize numerical features for algorithms sensitive to feature scales.
#   - Encode categorical variables using techniques like one-hot encoding or
#     label encoding.
#
# Feature Engineering:
#   - Evaluate feature importance.
#   - Consider interactions or derived metrics from existing features.
#
# Predictive Modeling:
#   - Train models like Logistic Regression, Random Forest, or Gradient
#     Boosting.
#   - Use cross-validation to assess model performance using metrics such as
#     F1 score due to the class imbalance.
# ---------------------------------------------------------------------------
# Preprocessing and logistic-regression baseline.
# ---------------------------------------------------------------------------

# Encode every categorical (object-dtype) column — including the target 'y'
# ("no"/"yes" -> 0/1) — with a per-column LabelEncoder; keep the fitted
# encoders so predictions can be inverse-transformed later.
categorical_columns = data.select_dtypes(include=['object']).columns
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Split the data into features and target ('y' is the subscription outcome).
X = data.drop('y', axis=1)
y = data['y']

# Stratified split preserves the (imbalanced) class ratio in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# FIX: scale BEFORE oversampling. SMOTE synthesises minority samples from
# nearest neighbours in feature space, so on unscaled data the large-range
# columns dominate the distance metric; the original also fit the scaler on
# SMOTE's synthetic rows. The scaler is fit on the training split only, to
# avoid leaking test-set statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Oversample the minority class on the scaled training data to balance it.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)

# Logistic-regression baseline. max_iter raised from the default (100) so
# the lbfgs solver converges without warnings on this feature set.
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train_balanced, y_train_balanced)

# Predict on the untouched, scaled test split.
y_pred = model.predict(X_test_scaled)

# Evaluate the model.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Insights and Next Steps:
#   - Feature importance: logistic-regression coefficients indicate feature
#     importance — features with larger absolute coefficients contribute
#     more to the prediction.
#   - Evaluation metrics: the classification report provides precision,
#     recall and F1 per class, which are more informative than accuracy
#     given the class imbalance.