Spaces:
No application file
No application file
| #!/usr/bin/env python | |
| # coding: utf-8 | |
| # In[1]: | |
import io

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
# In[3]:
# Paths to the four Excel exports of the bank-marketing dataset.
file_paths = {
    "bank_additional": "bank-additional.xlsx",
    "bank_additional_full": "bank-additional-full.xlsx",
    "bank_full": "bank-full.xlsx",
    "bank": "bank.xlsx",
}

# In[6]-[9]:
# Read each workbook once via a single loop instead of four copy-pasted
# read_excel calls, then keep the individual names the rest of the script uses.
_frames = {name: pd.read_excel(path) for name, path in file_paths.items()}
bank_additional = _frames["bank_additional"]
bank_additional_full = _frames["bank_additional_full"]
bank_full = _frames["bank_full"]
bank = _frames["bank"]
# In[10]:
# BUG FIX: DataFrame.info() PRINTS its summary and returns None, so the
# original dict comprehension stored {"bank_additional": None, ...}.
# Capture the text through a StringIO buffer so datasets_info actually
# holds each dataset's structural summary.
datasets_info = {}
for _name, _frame in (
    ("bank_additional", bank_additional),
    ("bank_additional_full", bank_additional_full),
    ("bank_full", bank_full),
    ("bank", bank),
):
    _buf = io.StringIO()
    _frame.info(buf=_buf)
    datasets_info[_name] = _buf.getvalue()

# In[11]-[13]:
# .head() previews — printed explicitly, because a bare expression only
# renders inside a notebook, not when run as a script.
print(bank_additional.head())
print(bank_additional_full.head())
print(bank_full.head())

# In[14]:
# Show the captured structural summaries.
for _name, _summary in datasets_info.items():
    print(f"--- {_name} ---\n{_summary}")

# In[15]:
# Work on a copy of bank_additional_full for EDA/modelling so the raw
# frame stays untouched.
data = bank_additional_full.copy()
# In[16]:
# Per-column count of missing values.
missing_values = data.isnull().sum()

# In[18]:
# Descriptive statistics for every column, numeric and categorical alike.
# (The notebook computed this twice in consecutive cells — once is enough.)
basic_stats = data.describe(include="all")

# In[22]:
# Printed explicitly: the original bare tuple expression is a no-op in a script.
print(missing_values)
print(basic_stats)
# In[19]:
# 1. High-level overview of the working dataset.
print("Dataset shape:", data.shape)

# In[20]:
print("\nDataset sample:\n", data.head())

# In[21]:
print("\nData types:\n", data.dtypes)

# In[22]:
# 2. Summary statistics, numeric and categorical columns separately.
print("\nSummary statistics (numerical features):\n", data.describe())

# In[23]:
print("\nSummary statistics (categorical features):\n", data.describe(include=['object']))
# In[25]:
# 3. Correlation matrix over the numeric columns.
numerical_features = data.select_dtypes(include=['int64', 'float64']).columns
plt.figure(figsize=(10, 8))
sns.heatmap(data[numerical_features].corr(), annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix (Numerical Features)')
plt.show()

# In[26]:
# 4. Histogram (with KDE overlay) for every numeric column.
for feature in numerical_features:
    plt.figure(figsize=(6, 4))
    sns.histplot(data[feature], kde=True, bins=30)
    plt.title(f'Distribution of {feature}')
    plt.xlabel(feature)
    plt.ylabel('Frequency')
    plt.show()

# In[27]:
# 5. Box plots to surface outliers in each numeric column.
for feature in numerical_features:
    plt.figure(figsize=(6, 4))
    sns.boxplot(data[feature])
    plt.title(f'Boxplot of {feature}')
    plt.xlabel(feature)
    plt.show()
# In[28]:
# 6. For each categorical column, count of every level split by the target y.
categorical_features = data.select_dtypes(include=['object']).columns
for feature in categorical_features:
    plt.figure(figsize=(10, 6))
    sns.countplot(x=feature, hue='y', data=data)
    plt.title(f'{feature} vs Subscription (y)')
    plt.xlabel(feature)
    plt.ylabel('Count')
    plt.legend(title='Subscription', loc='upper right')
    plt.xticks(rotation=45)
    plt.show()
# In[15]:
# 7. Class balance of the target variable y.
plt.figure(figsize=(8, 6))
sns.countplot(data=data, x='y', palette='coolwarm')
plt.title("Subscription Outcome Distribution (y)", fontsize=14)
plt.xlabel("Subscription ('yes' or 'no')")
plt.ylabel("Count")
plt.show()

# NOTE(review): the notebook re-drew the numeric correlation heatmap here,
# exactly duplicating the plot already produced in the In[25] cell above;
# the redundant copy has been removed.
| # Summary of Findings from EDA: | |
| # Data Integrity: | |
| # | |
| # There are no missing values across all features in the dataset. | |
| # The target variable y (subscription) is imbalanced, with significantly more "no" than "yes" responses. Addressing this imbalance will be critical during model training. | |
| # Numerical Feature Correlations: | |
| # | |
| # Features like euribor3m (3-month Euribor rate) and nr.employed (number of employees) exhibit strong correlations with other numerical variables, indicating potential predictive power. | |
| # Key Statistics: | |
| # | |
| # Age ranges from 17 to 98, with a mean of ~40. | |
| # Features such as pdays and previous show many default values (e.g., 999 for pdays), likely needing special handling. | |
| # Next Steps: | |
| # Data Preprocessing: | |
| # | |
| # Handle imbalanced classes using oversampling (e.g., SMOTE) or class weighting. | |
| # Normalize numerical features for algorithms sensitive to feature scales. | |
| # Encode categorical variables using techniques like one-hot encoding or label encoding. | |
| # Feature Engineering: | |
| # | |
| # Evaluate feature importance. | |
| # Consider interactions or derived metrics from existing features. | |
| # Predictive Modeling: | |
| # | |
| # Train models like Logistic Regression, Random Forest, or Gradient Boosting. | |
| # Use cross-validation to assess model performance using metrics such as F1 score due to the class imbalance. | |
# In[30]:
# Label-encode every categorical column in place, keeping each fitted
# encoder so the integer codes can be mapped back to their labels later.
label_encoders = {}
categorical_columns = data.select_dtypes(include=['object']).columns
for column in categorical_columns:
    encoder = LabelEncoder()
    data[column] = encoder.fit_transform(data[column])
    label_encoders[column] = encoder
# In[31]:
# Separate predictors from the target column 'y', then hold out 20% for
# testing; stratify so both splits keep the original class ratio.
y = data['y']
X = data.drop('y', axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# In[32]:
# Standardize FIRST, then oversample: SMOTE synthesizes minority samples
# from k-nearest neighbours — a distance-based step — so features should be
# on a common scale before resampling.  This also means the scaler is fit
# on the real (unbalanced) training rows only, never on synthetic rows or
# the test set.
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Balance the classes on the scaled training data; downstream names
# (X_train_scaled, y_train_balanced) are unchanged.
smote = SMOTE(random_state=42)
X_train_scaled, y_train_balanced = smote.fit_resample(X_train_std, y_train)
# In[33]:
# Baseline classifier.  max_iter raised from the default 100: the lbfgs
# solver commonly fails to converge within 100 iterations on this many
# encoded features and would emit a ConvergenceWarning otherwise.
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train_scaled, y_train_balanced)

# In[34]:
# Predict on the held-out, scaled test split (untouched by SMOTE).
y_pred = model.predict(X_test_scaled)

# In[35]:
# Overall accuracy plus the confusion matrix and per-class
# precision/recall/F1 — the per-class view matters because the target
# classes are imbalanced.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
| # Insights and Next Steps: | |
| # | |
| # Feature Importance: Logistic regression provides coefficients that indicate feature importance. Features with higher absolute coefficients contribute more to the prediction. | |
| # | |
| # Evaluation Metrics: The classification report provides accuracy, precision, recall, and F1 scores. | |
| # In[ ]: | |
| # In[ ]: | |