File size: 5,360 Bytes

ca1226e

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
import numpy as np

# Load the dataset. skiprows=1 drops the file's first line (presumably its own
# header row) and the two columns are given explicit names instead.
df = pd.read_csv(
    'mail_data.csv',
    names=['Category', 'Message'],
    header=None,
    skiprows=1,
)

# Basic info: dimensions, class balance and missing values per column.
print("Dataset Shape:", df.shape)
print("\nValue Counts:\n", df['Category'].value_counts())
print("\nMissing Values:\n", df.isnull().sum())

# Character length of every message.
df['Length'] = df['Message'].apply(len)

# Visualizations: one figure; every plot below is placed on a 3x2 subplot grid.
plt.figure(figsize=(18, 10))

# 1. Class distribution
plt.subplot(3, 2, 1)
sns.countplot(data=df, x='Category')
plt.title('Class Distribution (Spam vs Ham)')

# 2. Message length distribution, split by class
plt.subplot(3, 2, 2)
sns.histplot(data=df, x='Length', hue='Category', bins=50, kde=True)
plt.title('Message Length Distribution')

# Print the first message of each class as an example.
print("\nSample Ham:")
print(df.loc[df['Category'] == 'ham', 'Message'].iloc[0])
print("\nSample Spam:")
print(df.loc[df['Category'] == 'spam', 'Message'].iloc[0])

# Let's check whether there are other characteristics in our text data that separate spam from ham

#Word counts (how often different words are used)
def get_word_count(messages_series: pd.Series) -> pd.Series:
    """Return the relative frequency of every word across all messages.

    Each message has the characters ``. ? , ! |`` removed, is lower-cased and
    is split on single spaces; empty tokens are discarded.

    Args:
        messages_series: Series (or any iterable) of message strings.

    Returns:
        A Series indexed by word whose values are that word's share of all
        word occurrences, as produced by ``value_counts(normalize=True)``.
    """
    # Raw string: the previous non-raw pattern relied on invalid escape
    # sequences such as "\.", which newer Python versions warn about.
    # "|" stays in the class because the previous pattern stripped it as well.
    punctuation = re.compile(r"[.?,!|]+")

    word_list = []
    for message in messages_series:
        clean_msg = punctuation.sub("", message).lower()
        # Splitting on " " and dropping empty tokens also absorbs runs of
        # spaces, so no separate space-collapsing pass is needed.
        word_list.extend(word for word in clean_msg.split(" ") if word)

    return pd.Series(word_list).value_counts(normalize=True)

spam_word_count = get_word_count(df.loc[df['Category'] == 'spam', 'Message'])
ham_word_count = get_word_count(df.loc[df['Category'] == 'ham', 'Message'])

# Compare the word frequencies to find the words used more often in spam
# emails than in ham emails (and vice versa).
# The vocabulary is the UNION of both classes: a word that never occurs in one
# class is given frequency 0 there. (An intersection would silently drop the
# words that appear in only one class.)
words_list = list(set(spam_word_count.index) | set(ham_word_count.index))
ham_frequencies = ham_word_count.reindex(words_list, fill_value=0)
spam_frequencies = spam_word_count.reindex(words_list, fill_value=0)

# Positive values: relatively more frequent in ham; negative values: in spam.
wordcount_distance_spam_ham = pd.Series(
    ham_frequencies.to_numpy() - spam_frequencies.to_numpy(),
    index=words_list,
).sort_values(ascending=False)

print("Words more often found in normal emails than spam emails")
print(wordcount_distance_spam_ham.iloc[:10])  # Words more often present in ham emails
print("words more often found in spam emails than normal emails")
print(wordcount_distance_spam_ham.iloc[-10:])  # Words more often present in spam emails

#Mail Words length (numbers of words used in the message) 
def get_word_len(messages_series: pd.Series) -> pd.Series:
    """Count the words in each message.

    The characters ``. ? , ! |`` and all digits are removed first, so tokens
    made up only of those characters do not count as words. The remaining
    text is split on single spaces and empty tokens are ignored.

    Args:
        messages_series: Series (or any iterable) of message strings.

    Returns:
        A Series with a default integer index holding one word count per
        message, in input order.
    """
    # Raw string avoids the invalid escape sequences ("\.", "\?", ...) of the
    # previous non-raw pattern; the set of stripped characters is unchanged.
    stripped_chars = re.compile(r"[.?,!|0-9]+")

    word_counts = [
        sum(1 for token in stripped_chars.sub("", message).split(" ") if token)
        for message in messages_series
    ]
    return pd.Series(word_counts)

# Number of words per message, stored as a new column.
df['word_len'] = get_word_len(df['Message'])

# 3. Word count distribution, split by class (third cell of the 3x2 grid)
word_count_ax = plt.subplot(3, 2, 3)
sns.histplot(
    data=df,
    x='word_len',
    hue='Category',
    bins=50,
    kde=True,
    ax=word_count_ax,
)
word_count_ax.set_title('Word count Distribution')

#Average word length (mean number of characters per word in the message)
def get_word_len2(messages_series: pd.Series) -> pd.Series:
    """Compute the average word length (in characters) of each message.

    The characters ``. ? , ! |`` and all digits are removed, the text is
    lower-cased and split on single spaces, and empty tokens are ignored.

    Args:
        messages_series: Series (or any iterable) of message strings.

    Returns:
        A Series with a default integer index holding one value per message,
        in input order: total characters divided by number of words, or 0
        when the message contains no words.
    """
    # Raw string avoids the invalid escape sequences ("\.", "\?", ...) of the
    # previous non-raw pattern; the set of stripped characters is unchanged.
    stripped_chars = re.compile(r"[.?,!|0-9]+")

    average_lengths = []
    for message in messages_series:
        cleaned = stripped_chars.sub("", message).lower()
        words = [token for token in cleaned.split(" ") if token]
        # Guard against division by zero for messages without any word.
        average_lengths.append(sum(map(len, words)) / len(words) if words else 0)

    return pd.Series(average_lengths)

# Average word length per message, stored as a new column.
df['avg_word_len'] = get_word_len2(df['Message'])

# 4. Average word length distribution, split by class (fourth cell of the grid)
word_length_ax = plt.subplot(3, 2, 4)
sns.histplot(
    data=df,
    x='avg_word_len',
    hue='Category',
    bins=50,
    kde=True,
    ax=word_length_ax,
)
word_length_ax.set_title('Word length Distribution')

#Average sentence length (mean number of words per sentence in the message)
def get_avg_sentence_len(messages_series: pd.Series) -> pd.Series:
    """Estimate the average number of words per sentence for each message.

    A message is split on single spaces and empty tokens are ignored. A word
    containing ``.``, ``?`` or ``!`` marks the end of a sentence; a new
    sentence is counted only once a later word without such a mark appears,
    so trailing punctuation does not open an extra sentence. Every message
    with at least one word counts as having at least one sentence.

    Args:
        messages_series: Series (or any iterable) of message strings.

    Returns:
        A Series with a default integer index holding exactly one value per
        message, in input order: word count divided by sentence count, or 0
        for a message without any word.
    """
    terminators = (".", "?", "!")

    averages = []
    for message in messages_series:
        # Drop empty tokens so leading/trailing/repeated spaces are neither
        # counted as words nor allowed to start a phantom sentence.
        words = [token for token in message.split(" ") if token]
        if not words:
            # Still emit a value so the result stays aligned with the input.
            averages.append(0)
            continue

        sentence_count = 1
        sentence_finished = False
        for word in words:
            if any(mark in word for mark in terminators):
                sentence_finished = True
            elif sentence_finished:
                sentence_count += 1
                sentence_finished = False

        averages.append(len(words) / sentence_count)

    return pd.Series(averages)

# Average sentence length per message, stored as a new column.
df['avg_sentence_len'] = get_avg_sentence_len(df['Message'])

# 5. Average sentence length distribution, split by class (fifth cell of the grid)
sentence_length_ax = plt.subplot(3, 2, 5)
sns.histplot(
    data=df,
    x='avg_sentence_len',
    hue='Category',
    bins=50,
    kde=True,
    ax=sentence_length_ax,
)
sentence_length_ax.set_title('Sentence length Distribution')

# Tidy the subplot spacing, then write the whole figure to disk.
plt.tight_layout()
plt.savefig('eda_plots.png')
print("\nEDA plots saved to eda_plots.png")