Spaces:

HOLYBOY
/

Azubi_assignment

No application file

App Files Files Community

HOLYBOY committed on Dec 6, 2024

Commit

baae561

verified ·

1 Parent(s): 2b2790e

PyFileAdded

Browse files

Files changed (6) hide show

.gitattributes +2 -0
Azubi Africa.py +335 -0
bank-additional-full.xlsx +3 -0
bank-additional.xlsx +0 -0
bank-full.xlsx +3 -0
bank.xlsx +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+bank-additional-full.xlsx filter=lfs diff=lfs merge=lfs -text
+bank-full.xlsx filter=lfs diff=lfs merge=lfs -text

Azubi Africa.py ADDED Viewed

	@@ -0,0 +1,335 @@

+#!/usr/bin/env python
+# coding: utf-8
+# In[1]:
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler, LabelEncoder
+from imblearn.over_sampling import SMOTE
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
+# In[3]:
+# Excel workbook file name for each dataset, keyed by the name of the variable it is loaded into.
+file_paths = dict(
+    bank_additional="bank-additional.xlsx",
+    bank_additional_full="bank-additional-full.xlsx",
+    bank_full="bank-full.xlsx",
+    bank="bank.xlsx",
+)
+# In[6]:
+# Read every workbook into its own pandas DataFrame (one read_excel call per entry).
+bank_additional, bank_additional_full, bank_full, bank = (
+    pd.read_excel(file_paths[dataset_name])
+    for dataset_name in ("bank_additional", "bank_additional_full", "bank_full", "bank")
+)
+# In[10]:
+# Summarise the structure of each loaded dataset.
+# DataFrame.info() writes its report to a stream and returns None, so the report is
+# captured through ``buf`` to make ``datasets_info`` map each dataset name to its text.
+import io
+_loaded_frames = {
+    "bank_additional": bank_additional,
+    "bank_additional_full": bank_additional_full,
+    "bank_full": bank_full,
+    "bank": bank,
+}
+datasets_info = {}
+for _name, _frame in _loaded_frames.items():
+    _report = io.StringIO()
+    _frame.info(buf=_report)
+    datasets_info[_name] = _report.getvalue()
+    print(f"--- {_name} ---\n{datasets_info[_name]}")
+# In[11]:
+# Preview the first rows of every dataset. print() is used because a bare expression
+# produces no output when this file is run as a script rather than as notebook cells.
+for _name, _frame in _loaded_frames.items():
+    print(f"--- {_name} (first rows) ---\n{_frame.head()}")
+# In[15]:
+# Use a copy of bank_additional_full for EDA so the loaded frame itself is not modified.
+data = bank_additional_full.copy()
+# In[16]:
+# Number of missing (null) values in each column.
+missing_values = data.isnull().sum()
+# In[18]:
+# Descriptive statistics over all columns (include="all"), computed once.
+basic_stats = data.describe(include="all")
+# print() is used because a bare expression shows nothing when the file runs as a script.
+print("Missing values per column:\n", missing_values)
+print("\nBasic statistics:\n", basic_stats)
+# In[19]:
+# 1-2. Overview and summary statistics: print each labelled view of the dataset in turn
+# (shape, first rows, column dtypes, then describe() for default and object columns).
+for heading, view in (
+    ("Dataset shape:", data.shape),
+    ("\nDataset sample:\n", data.head()),
+    ("\nData types:\n", data.dtypes),
+    ("\nSummary statistics (numerical features):\n", data.describe()),
+    ("\nSummary statistics (categorical features):\n", data.describe(include=['object'])),
+):
+    print(heading, view)
+# In[25]:
+# 3. Correlation analysis: pairwise correlations between the int64/float64 columns,
+# drawn as an annotated heatmap.
+numerical_features = data.select_dtypes(include=['int64', 'float64']).columns
+numeric_corr = data[numerical_features].corr()
+plt.figure(figsize=(10, 8))
+sns.heatmap(numeric_corr, annot=True, fmt=".2f", cmap='coolwarm')
+plt.title('Correlation Matrix (Numerical Features)')
+plt.show()
+# In[26]:
+# 4. One histogram (30 bins, KDE overlay) per numerical column, each in its own figure.
+for feature in numerical_features:
+    plt.figure(figsize=(6, 4))
+    hist_ax = sns.histplot(data[feature], kde=True, bins=30)
+    hist_ax.set_title(f'Distribution of {feature}')
+    hist_ax.set_xlabel(feature)
+    hist_ax.set_ylabel('Frequency')
+    plt.show()
+# In[27]:
+# 5. One boxplot per numerical column to identify outliers.
+# The series is passed as ``x=`` explicitly. Given positionally, seaborn >= 0.12 treats it
+# as ``data`` (x/y are keyword-only there), so the values are not guaranteed to lie on the
+# x-axis that is labelled below; earlier releases emitted a deprecation warning for it.
+for col in numerical_features:
+    plt.figure(figsize=(6, 4))
+    sns.boxplot(x=data[col])
+    plt.title(f'Boxplot of {col}')
+    plt.xlabel(col)
+    plt.show()
+# In[28]:
+# 6. Relationship between each categorical feature and the target: counts per category,
+# split by the value of 'y'.
+categorical_features = data.select_dtypes(include=['object']).columns
+for col in categorical_features:
+    if col == 'y':
+        # The target split by itself is uninformative, so it is not plotted here.
+        continue
+    plt.figure(figsize=(10, 6))
+    sns.countplot(x=col, hue='y', data=data)
+    plt.title(f'{col} vs Subscription (y)')
+    plt.xlabel(col)
+    plt.ylabel('Count')
+    plt.legend(title='Subscription', loc='upper right')
+    # Rotate category labels so longer names stay readable.
+    plt.xticks(rotation=45)
+    plt.show()
+# In[15]:
+# 7. Distribution of the target variable: number of rows for each value of 'y'.
+target_fig, target_ax = plt.subplots(figsize=(8, 6))
+sns.countplot(data=data, x='y', palette='coolwarm', ax=target_ax)
+target_ax.set_title("Subscription Outcome Distribution (y)", fontsize=14)
+target_ax.set_xlabel("Subscription ('yes' or 'no')")
+target_ax.set_ylabel("Count")
+plt.show()
+# In[16]:
+# 8. Correlation heatmap over the float64/int64 columns, annotated to two decimals.
+numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns
+correlation_matrix = data[numerical_cols].corr()
+heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
+sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=heatmap_ax)
+heatmap_ax.set_title("Correlation Heatmap for Numerical Features", fontsize=14)
+plt.show()
+# Summary of Findings from EDA:
+# Data Integrity:
+#
+# There are no missing values across all features in the dataset.
+# The target variable y (subscription) is imbalanced, with significantly more "no" than "yes" responses. Addressing this imbalance will be critical during model training.
+# Numerical Feature Correlations:
+#
+# Features like euribor3m (3-month Euribor rate) and nr.employed (number of employees) exhibit strong correlations with other numerical variables, indicating potential predictive power.
+# Key Statistics:
+#
+# Age ranges from 17 to 98, with a mean of ~40.
+# Features such as pdays and previous show many default values (e.g., 999 for pdays), likely needing special handling.
+# Next Steps:
+# Data Preprocessing:
+#
+# Handle imbalanced classes using oversampling (e.g., SMOTE) or class weighting.
+# Normalize numerical features for algorithms sensitive to feature scales.
+# Encode categorical variables using techniques like one-hot encoding or label encoding.
+# Feature Engineering:
+#
+# Evaluate feature importance.
+# Consider interactions or derived metrics from existing features.
+# Predictive Modeling:
+#
+# Train models like Logistic Regression, Random Forest, or Gradient Boosting.
+# Use cross-validation to assess model performance using metrics such as F1 score due to the class imbalance.
+# In[30]:
+# Encode categorical (object-dtype) columns.
+# LabelEncoder maps categories to arbitrary integer codes, which a linear model reads as an
+# ordering; scikit-learn documents it for target values, not input features. The input
+# columns are therefore one-hot encoded and only the target 'y' is label-encoded.
+categorical_columns = data.select_dtypes(include=['object']).columns
+label_encoders = {}
+if 'y' in categorical_columns:
+    le = LabelEncoder()
+    data['y'] = le.fit_transform(data['y'])
+    # Keep the fitted encoder so the integer codes can be mapped back to the original labels.
+    label_encoders['y'] = le
+feature_columns = [name for name in categorical_columns if name != 'y']
+if feature_columns:
+    # dtype=int keeps the indicator columns numeric (0/1) alongside the existing numeric columns.
+    data = pd.get_dummies(data, columns=feature_columns, dtype=int)
+# In[31]:
+# Separate the target column 'y' from the predictor columns.
+y = data['y']
+X = data.drop(columns='y')
+# Hold out 20% of the rows for testing, stratified on y, with a fixed seed for repeatability.
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.2, stratify=y, random_state=42
+)
+# In[32]:
+# Scale first, then balance the classes.
+# SMOTE builds synthetic rows from nearest neighbours, so it should see features on a common
+# scale; otherwise large-magnitude columns dominate the neighbour search. Fitting the scaler
+# on the original training rows also keeps synthetic rows out of the scaling statistics.
+scaler = StandardScaler()
+X_train_standardized = scaler.fit_transform(X_train)
+# The test split is only transformed with the training statistics and is never resampled.
+X_test_scaled = scaler.transform(X_test)
+# Apply SMOTE to the standardized training split to handle class imbalance.
+smote = SMOTE(random_state=42)
+X_train_balanced, y_train_balanced = smote.fit_resample(X_train_standardized, y_train)
+# Same name as before for the matrix used in training: scaled and balanced, aligned with y_train_balanced.
+X_train_scaled = X_train_balanced
+# In[33]:
+# Fit a logistic-regression classifier on the balanced, scaled training data.
+model = LogisticRegression(random_state=42).fit(X_train_scaled, y_train_balanced)
+# In[34]:
+# Predict labels for the scaled test features.
+y_pred = model.predict(X_test_scaled)
+# In[35]:
+# Evaluate on the test split: accuracy, confusion matrix and per-class report.
+test_accuracy = accuracy_score(y_test, y_pred)
+test_confusion = confusion_matrix(y_test, y_pred)
+test_report = classification_report(y_test, y_pred)
+print("Accuracy:", test_accuracy)
+print("\nConfusion Matrix:\n", test_confusion)
+print("\nClassification Report:\n", test_report)
+# Insights and Next Steps:
+#
+# Feature Importance: Logistic regression provides coefficients that indicate feature importance. Features with higher absolute coefficients contribute more to the prediction.
+#
+# Evaluation Metrics: The classification report provides accuracy, precision, recall, and F1 scores.
+# In[ ]:
+# In[ ]:

bank-additional-full.xlsx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e746abde169e0ce0e0410d1d8eb35bb96c75ce7b93d4d4008f623ccd0ba1b57b
+size 3582419

bank-additional.xlsx ADDED Viewed

Binary file (416 kB). View file

bank-full.xlsx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e6c298895827c34e1db9e8f57b557eeed3ba146edab66d60c031af17d0faf1cc
+size 3410864

bank.xlsx ADDED Viewed

Binary file (360 kB). View file