|
|
|
|
|
|
|
|
import numpy as np |
|
|
import pandas as pd |
|
|
|
|
|
|
|
|
|
|
|
import matplotlib.pyplot as plt |
|
|
import seaborn as sns |
|
|
|
|
|
|
|
|
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, classification_report |
|
|
|
|
|
|
|
|
from wordcloud import WordCloud, STOPWORDS |
|
|
|
|
|
|
|
|
from sklearn.feature_extraction.text import CountVectorizer |
|
|
|
|
|
|
|
|
from sklearn.model_selection import train_test_split |
|
|
|
|
|
|
|
|
from sklearn.naive_bayes import MultinomialNB |
|
|
|
|
|
|
|
|
from sklearn.pipeline import Pipeline |
|
|
|
|
|
|
|
|
import warnings

# Install a global "ignore" filter for every warning category so that
# warnings raised during the rest of the run are not shown.
# NOTE(review): this is a blanket filter; it also hides warnings that
# may be worth seeing, so consider narrowing it.
warnings.filterwarnings(action='ignore', category=Warning)
|
|
|
|
|
|
|
|
|
|
|
import os

# Print the directory the script is running from; the data files used by
# this script are addressed with relative paths, so this makes a wrong
# working directory easy to spot.
# os.getcwd() is used instead of print(os.system("pwd")): os.system()
# returns the command's exit status, so the old form printed a stray "0"
# after the shell's own output, and it relied on a `pwd` command being
# available in the shell.
print(os.getcwd())
|
|
|
|
|
|
|
|
|
|
|
# Load the raw spam dataset from the local data directory, decoding the
# file as ISO-8859-1.
df = pd.read_csv(filepath_or_buffer="./data/spam.csv", encoding='ISO-8859-1')

# Preview the first rows to confirm the file parsed as expected.
print(df.head(n=5))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Report the dimensions of the loaded frame (rows, then columns).
row_count, column_count = df.shape
print("Number of rows are: ", row_count)
print("Number of columns are: ", column_count)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Print pandas' built-in structural summary of the frame.
df.info()

# Count rows that are exact duplicates of an earlier row.
duplicate_mask = df.duplicated()
dup = duplicate_mask.sum()
print(f'number of duplicated rows are {dup}')

# Number of missing values in each column.
missing_per_column = df.isnull().sum()
print(missing_per_column)

# Column labels exactly as they were read from the CSV.
print(df.columns)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Descriptive statistics over all columns (include='all'), rounded to
# two decimal places.
summary = df.describe(include='all')
print(summary.round(2))

# Number of distinct values in each column, one line per column.
for column in df.columns:
    print("No. of unique values in", column, "is", df[column].nunique())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Give the 'v1' and 'v2' columns descriptive names and discard the three
# 'Unnamed: N' columns.
# The labels to drop are passed as a list (the documented list-like form
# for DataFrame.drop) instead of an unordered set literal, and the result
# is rebound to `df` rather than mutating the frame with inplace=True.
df = (
    df.rename(columns={"v1": "Category", "v2": "Message"})
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
)
|
|
|
|
|
|
|
|
# Binary label column: 1 where Category is exactly 'spam', 0 for any
# other value.
# A vectorised comparison replaces the per-row Python lambda; the explicit
# int64 cast keeps the column as 0/1 integers rather than booleans.
df['Spam'] = (df['Category'] == 'spam').astype('int64')

# Preview the frame with the new column in place.
print(df.head())
|
|
|
|
|
|
|
|
|
|
|
# Write the cleaned frame to disk; index=False leaves the row index out of
# the CSV.
df.to_csv(path_or_buf='./data/clean_spam.csv', index=False)
|
|
|