"""Explore the raw SMS spam dataset and export a cleaned copy.

Reads ``./data/spam.csv``, prints basic diagnostics (shape, info, duplicate
and null counts, per-column unique counts), renames the ``v1``/``v2`` columns
to ``Category``/``Message``, drops the three unnamed columns, adds a binary
``Spam`` column and writes the result to ``./data/clean_spam.csv``.
"""

# Standard library
import os
import warnings

# Numpy & Pandas for data processing & data wrangling
import numpy as np
import pandas as pd

# Tools for visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Evaluation metrics
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    roc_curve,
    classification_report,
)

# Word cloud library
from wordcloud import WordCloud, STOPWORDS

# Text preprocessing
from sklearn.feature_extraction.text import CountVectorizer

# Model selection
from sklearn.model_selection import train_test_split

# ML model implementation
from sklearn.naive_bayes import MultinomialNB

# Pipeline class from scikit-learn
from sklearn.pipeline import Pipeline

# Silence library warnings for the rest of the script
warnings.filterwarnings('ignore')

#%matplotlib inline

# Show the current working directory (the relative data paths below depend
# on it). os.getcwd() returns the path itself; os.system("pwd") only returns
# the shell's exit status and is not portable.
print(os.getcwd())

# Load dataset
df = pd.read_csv("./data/spam.csv", encoding='ISO-8859-1')
print(df.head())

# Dataset rows & columns count
n_rows, n_cols = df.shape
print("Number of rows are: ", n_rows)
print("Number of columns are: ", n_cols)

# Dataset info
df.info()

# Dataset duplicate value count
dup = df.duplicated().sum()
print(f'number of duplicated rows are {dup}')

# Missing values / null values count
print(df.isnull().sum())

# Dataset columns
print(df.columns)

# Dataset describe (all columns included)
print(df.describe(include='all').round(2))

# Number of unique values for each column
for column in df.columns:
    print("No. of unique values in", column, "is", df[column].nunique())

# Rename the v1 & v2 columns to Category and Message
df.rename(columns={"v1": "Category", "v2": "Message"}, inplace=True)

# Remove the unnamed columns (noted in the original script as containing
# a large number of missing values)
df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True)

# Binary 'Spam' column: 1 where Category == 'spam', 0 otherwise (vectorised
# comparison instead of a per-row lambda)
df['Spam'] = df['Category'].eq('spam').astype('int64')

# Updated dataset
print(df.head())

# Export the cleaned data frame; index=False avoids saving row numbers
df.to_csv('./data/clean_spam.csv', index=False)