# QuietML / monoMNB / QuietML_feature_engineering.py
# drnull03's picture
# QuietML Version 1.0
# 31c93e2
# Import Libraries
# Importing Numpy & Pandas for data processing & data wrangling
import numpy as np
import pandas as pd
# Importing tools for visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Import evaluation metric libraries
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, classification_report
# Word Cloud library
from wordcloud import WordCloud, STOPWORDS
# Library used for data preprocessing
from sklearn.feature_extraction.text import CountVectorizer
# Import model selection libraries
from sklearn.model_selection import train_test_split
# Library used for ML Model implementation
from sklearn.naive_bayes import MultinomialNB
# Importing the Pipeline class from scikit-learn
from sklearn.pipeline import Pipeline
# Library used to suppress warnings
import warnings
# Silence every warning for the rest of the run so the console output stays
# readable. NOTE(review): this also hides deprecation notices from libraries.
warnings.filterwarnings('ignore')
#%matplotlib inline
# Show the current working directory
import os
# os.getcwd() returns the directory path itself and works on every platform.
# The previous os.system("pwd") call only returned the shell's exit status,
# so a stray "0" was printed, and it required a POSIX shell.
print(os.getcwd())
# Load Dataset
# The raw CSV is decoded as ISO-8859-1 (Latin-1).
df = pd.read_csv("./data/spam.csv", encoding='ISO-8859-1')
print(df.head(n=5))

# Dataset Rows & Columns count
# len() of the frame is its row count; len() of its columns Index is the
# column count.
print("Number of rows are: ", len(df))
print("Number of columns are: ", len(df.columns))

# Dataset Info
# info() writes its summary (dtypes, non-null counts, memory usage) to stdout
# on its own, so it is not wrapped in print().
df.info()

# Dataset Duplicate Value Count
# duplicated() flags rows that repeat an earlier row; summing counts the flags.
dup = df.duplicated().sum()
print(f'number of duplicated rows are {dup}')

# Missing Values/Null Values Count (one total per column)
print(df.isnull().sum())

# Dataset Columns
print(df.columns)

# Dataset Describe (all columns included), rounded to two decimals
print(df.describe(include='all').round(decimals=2))
# Check Unique Values for each variable using a for loop.
# Iterating the columns Index yields each column label in order; the loop body
# must be indented for the file to parse.
for column in df.columns:
    print("No. of unique values in", column, "is", df[column].nunique())
# Rename the raw v1 & v2 columns to Category and Message
df.rename(columns={"v1": "Category", "v2": "Message"}, inplace=True)
# Remove all the unnamed columns (per the original note, they contain a large
# number of missing values). The labels are passed as a list, the documented
# list-like form for DataFrame.drop, rather than as a set literal.
df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True)
# Create a binary 'Spam' column: 1 for 'spam' and 0 for anything else, based on
# the 'Category' column. The comparison is vectorised over the whole column
# instead of calling a Python lambda per row; casting the boolean mask to int64
# gives the same 0/1 integers.
df['Spam'] = (df['Category'] == 'spam').astype('int64')
# Updated new dataset
print(df.head())
# Export the cleaned data frame
df.to_csv('./data/clean_spam.csv', index=False)  # `index=False` avoids saving row numbers