Spaces:
No application file
No application file
| #!/usr/bin/env python | |
| # coding: utf-8 | |
| # In[1]: | |
import io

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
# In[3]:
# Paths to the four Excel exports of the bank-marketing dataset.
file_paths = {
    "bank_additional": "bank-additional.xlsx",
    "bank_additional_full": "bank-additional-full.xlsx",
    "bank_full": "bank-full.xlsx",
    "bank": "bank.xlsx",
}

# In[6]-[9]:
# Read each workbook once via a single loop instead of four copy-pasted
# read_excel calls, then keep the individual names the rest of the script uses.
_frames = {name: pd.read_excel(path) for name, path in file_paths.items()}
bank_additional = _frames["bank_additional"]
bank_additional_full = _frames["bank_additional_full"]
bank_full = _frames["bank_full"]
bank = _frames["bank"]
# In[10]:
# BUG FIX: DataFrame.info() PRINTS its summary and returns None, so the
# original dict comprehension stored {"bank_additional": None, ...}.
# Capture the text through a StringIO buffer so datasets_info actually
# holds each dataset's structural summary.
datasets_info = {}
for _name, _frame in (
    ("bank_additional", bank_additional),
    ("bank_additional_full", bank_additional_full),
    ("bank_full", bank_full),
    ("bank", bank),
):
    _buf = io.StringIO()
    _frame.info(buf=_buf)
    datasets_info[_name] = _buf.getvalue()

# In[11]-[13]:
# .head() previews — printed explicitly, because a bare expression only
# renders inside a notebook, not when run as a script.
print(bank_additional.head())
print(bank_additional_full.head())
print(bank_full.head())

# In[14]:
# Show the captured structural summaries.
for _name, _summary in datasets_info.items():
    print(f"--- {_name} ---\n{_summary}")

# In[15]:
# Work on a copy of bank_additional_full for EDA/modelling so the raw
# frame stays untouched.
data = bank_additional_full.copy()
# In[16]:
# Per-column count of missing values.
missing_values = data.isnull().sum()

# In[18]:
# Descriptive statistics for every column, numeric and categorical alike.
# (The notebook computed this twice in consecutive cells — once is enough.)
basic_stats = data.describe(include="all")

# In[22]:
# Printed explicitly: the original bare tuple expression is a no-op in a script.
print(missing_values)
print(basic_stats)
# In[19]:
# 1. High-level overview of the working dataset.
print("Dataset shape:", data.shape)

# In[20]:
print("\nDataset sample:\n", data.head())

# In[21]:
print("\nData types:\n", data.dtypes)

# In[22]:
# 2. Summary statistics, numeric and categorical columns separately.
print("\nSummary statistics (numerical features):\n", data.describe())

# In[23]:
print("\nSummary statistics (categorical features):\n", data.describe(include=['object']))
# In[25]:
# 3. Correlation matrix over the numeric columns.
numerical_features = data.select_dtypes(include=['int64', 'float64']).columns
plt.figure(figsize=(10, 8))
sns.heatmap(data[numerical_features].corr(), annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix (Numerical Features)')
plt.show()

# In[26]:
# 4. Histogram (with KDE overlay) for every numeric column.
for feature in numerical_features:
    plt.figure(figsize=(6, 4))
    sns.histplot(data[feature], kde=True, bins=30)
    plt.title(f'Distribution of {feature}')
    plt.xlabel(feature)
    plt.ylabel('Frequency')
    plt.show()

# In[27]:
# 5. Box plots to surface outliers in each numeric column.
for feature in numerical_features:
    plt.figure(figsize=(6, 4))
    sns.boxplot(data[feature])
    plt.title(f'Boxplot of {feature}')
    plt.xlabel(feature)
    plt.show()
# In[28]:
# 6. For each categorical column, count of every level split by the target y.
categorical_features = data.select_dtypes(include=['object']).columns
for feature in categorical_features:
    plt.figure(figsize=(10, 6))
    sns.countplot(x=feature, hue='y', data=data)
    plt.title(f'{feature} vs Subscription (y)')
    plt.xlabel(feature)
    plt.ylabel('Count')
    plt.legend(title='Subscription', loc='upper right')
    plt.xticks(rotation=45)
    plt.show()
# In[15]:
# 7. Class balance of the target variable y.
plt.figure(figsize=(8, 6))
sns.countplot(data=data, x='y', palette='coolwarm')
plt.title("Subscription Outcome Distribution (y)", fontsize=14)
plt.xlabel("Subscription ('yes' or 'no')")
plt.ylabel("Count")
plt.show()

# NOTE(review): the notebook re-drew the numeric correlation heatmap here,
# exactly duplicating the plot already produced in the In[25] cell above;
# the redundant copy has been removed.
| # Summary of Findings from EDA: | |
| # Data Integrity: | |
| # | |
| # There are no missing values across all features in the dataset. | |
| # The target variable y (subscription) is imbalanced, with significantly more "no" than "yes" responses. Addressing this imbalance will be critical during model training. | |
| # Numerical Feature Correlations: | |
| # | |
| # Features like euribor3m (3-month Euribor rate) and nr.employed (number of employees) exhibit strong correlations with other numerical variables, indicating potential predictive power. | |
| # Key Statistics: | |
| # | |
| # Age ranges from 17 to 98, with a mean of ~40. | |
| # Features such as pdays and previous show many default values (e.g., 999 for pdays), likely needing special handling. | |
| # Next Steps: | |
| # Data Preprocessing: | |
| # | |
| # Handle imbalanced classes using oversampling (e.g., SMOTE) or class weighting. | |
| # Normalize numerical features for algorithms sensitive to feature scales. | |
| # Encode categorical variables using techniques like one-hot encoding or label encoding. | |
| # Feature Engineering: | |
| # | |
| # Evaluate feature importance. | |
| # Consider interactions or derived metrics from existing features. | |
| # Predictive Modeling: | |
| # | |
| # Train models like Logistic Regression, Random Forest, or Gradient Boosting. | |
| # Use cross-validation to assess model performance using metrics such as F1 score due to the class imbalance. | |
# In[30]:
# Label-encode every categorical column in place, keeping each fitted
# encoder so the integer codes can be mapped back to their labels later.
label_encoders = {}
categorical_columns = data.select_dtypes(include=['object']).columns
for column in categorical_columns:
    encoder = LabelEncoder()
    data[column] = encoder.fit_transform(data[column])
    label_encoders[column] = encoder
# In[31]:
# Separate predictors from the target column 'y', then hold out 20% for
# testing; stratify so both splits keep the original class ratio.
y = data['y']
X = data.drop('y', axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# In[32]:
# Standardize FIRST, then oversample: SMOTE synthesizes minority samples
# from k-nearest neighbours — a distance-based step — so features should be
# on a common scale before resampling.  This also means the scaler is fit
# on the real (unbalanced) training rows only, never on synthetic rows or
# the test set.
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Balance the classes on the scaled training data; downstream names
# (X_train_scaled, y_train_balanced) are unchanged.
smote = SMOTE(random_state=42)
X_train_scaled, y_train_balanced = smote.fit_resample(X_train_std, y_train)
# In[33]:
# Baseline classifier.  max_iter raised from the default 100: the lbfgs
# solver commonly fails to converge within 100 iterations on this many
# encoded features and would emit a ConvergenceWarning otherwise.
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train_scaled, y_train_balanced)

# In[34]:
# Predict on the held-out, scaled test split (untouched by SMOTE).
y_pred = model.predict(X_test_scaled)

# In[35]:
# Overall accuracy plus the confusion matrix and per-class
# precision/recall/F1 — the per-class view matters because the target
# classes are imbalanced.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
| # Insights and Next Steps: | |
| # | |
| # Feature Importance: Logistic regression provides coefficients that indicate feature importance. Features with higher absolute coefficients contribute more to the prediction. | |
| # | |
| # Evaluation Metrics: The classification report provides accuracy, precision, recall, and F1 scores. | |
| # In[ ]: | |
| # In[ ]: | |