"""Exploratory data analysis of the spam/ham mail dataset.

Loads ``mail_data.csv``, prints basic statistics, compares word usage between
spam and ham messages, and saves a grid of distribution plots to
``eda_plots.png``.
"""

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
import numpy as np

# Characters removed before splitting a message into words.
# NOTE: "|" inside a character class is a literal pipe, not alternation, so
# pipes are stripped together with ". ? , !" -- the same character set the
# original (non-raw) patterns matched.
_PUNCTUATION_RE = re.compile(r"[.|?,!]+")
_PUNCTUATION_AND_DIGITS_RE = re.compile(r"[.|?,!0-9]+")

# A word containing any of these marks is treated as ending a sentence.
_SENTENCE_END_MARKS = (".", "?", "!")


def _tokenize(message: str, strip_pattern: re.Pattern) -> list:
    """Return the lower-cased, space-separated words of *message*.

    Characters matched by *strip_pattern* are deleted first; empty tokens
    produced by repeated, leading or trailing spaces are discarded.
    """
    cleaned = strip_pattern.sub("", message).lower()
    return [word for word in cleaned.split(" ") if word]


def _result_index(messages_series):
    """Return the index of *messages_series* if it is a Series, else None.

    Re-using the input index keeps per-message results aligned with their
    rows when they are assigned back to a DataFrame column.
    """
    if isinstance(messages_series, pd.Series):
        return messages_series.index
    return None


# Load dataset
df = pd.read_csv('mail_data.csv', names=['Category', 'Message'], header=None, skiprows=1)

# Basic info
print("Dataset Shape:", df.shape)
print("\nValue Counts:\n", df['Category'].value_counts())
print("\nMissing Values:\n", df.isnull().sum())

# Add message length (number of characters)
df['Length'] = df['Message'].apply(len)

# Visualizations
plt.figure(figsize=(18, 10))

# 1. Class Distribution
plt.subplot(3, 2, 1)
sns.countplot(x='Category', data=df)
plt.title('Class Distribution (Spam vs Ham)')

# 2. Message Length Distribution
plt.subplot(3, 2, 2)
sns.histplot(data=df, x='Length', hue='Category', bins=50, kde=True)
plt.title('Message Length Distribution')

# Sample messages
print("\nSample Ham:")
print(df[df['Category'] == 'ham']['Message'].iloc[0])
print("\nSample Spam:")
print(df[df['Category'] == 'spam']['Message'].iloc[0])


# Let's check if there are other characteristics in our text data that
# separate spam from ham.

# Word counts (how often different words are used)
def get_word_count(messages_series: pd.Series) -> pd.Series:
    """Return the relative frequency of every word across all messages.

    Args:
        messages_series: Iterable of message strings.

    Returns:
        A Series indexed by word whose values are the word's share of all
        words (they sum to 1), sorted from most to least frequent.
    """
    words = []
    for message in messages_series:
        words.extend(_tokenize(message, _PUNCTUATION_RE))
    # Explicit dtype keeps the empty-input case well defined.
    return pd.Series(words, dtype="object").value_counts(normalize=True)


spam_word_count = get_word_count(df[df['Category'] == 'spam']['Message'])
ham_word_count = get_word_count(df[df['Category'] == 'ham']['Message'])

# Compare word frequencies to find the words used more often in spam than in
# ham (and vice versa). The UNION of both vocabularies is used so that words
# appearing in only one class -- usually the most telling ones -- are kept
# with a frequency of 0 in the other class. Sorting makes the order of ties
# reproducible between runs.
words_list = sorted(set(spam_word_count.index) | set(ham_word_count.index))
ham_frequency = ham_word_count.reindex(words_list, fill_value=0.0)
spam_frequency = spam_word_count.reindex(words_list, fill_value=0.0)
wordcount_distance_spam_ham = pd.Series(
    ham_frequency.to_numpy() - spam_frequency.to_numpy(),
    index=words_list,
).sort_values(ascending=False)

print("Words more often found in normal emails than spam emails")
print(wordcount_distance_spam_ham.head(10))  # Words more often present in ham emails
print("words more often found in spam emails than normal emails")
print(wordcount_distance_spam_ham.tail(10))  # Words more often present in spam emails


# Word count per message (number of words used in the message)
def get_word_len(messages_series: pd.Series) -> pd.Series:
    """Return the number of words in each message.

    Punctuation and digits are removed before counting, so purely numeric
    tokens do not count as words.

    Args:
        messages_series: Iterable of message strings.

    Returns:
        An integer Series with one entry per message, sharing the input's
        index when the input is a Series.
    """
    counts = [
        len(_tokenize(message, _PUNCTUATION_AND_DIGITS_RE))
        for message in messages_series
    ]
    return pd.Series(counts, index=_result_index(messages_series), dtype="int64")


df['word_len'] = get_word_len(df['Message'])
plt.subplot(3, 2, 3)
sns.histplot(data=df, x='word_len', hue='Category', bins=50, kde=True)
plt.title('Word count Distribution')


# Average word length per message (characters per word)
def get_word_len2(messages_series: pd.Series) -> pd.Series:
    """Return the average word length (in characters) of each message.

    Punctuation and digits are removed before measuring. A message with no
    remaining words gets an average of 0.

    Args:
        messages_series: Iterable of message strings.

    Returns:
        A float Series with one entry per message, sharing the input's index
        when the input is a Series.
    """
    averages = []
    for message in messages_series:
        words = _tokenize(message, _PUNCTUATION_AND_DIGITS_RE)
        if words:
            averages.append(sum(len(word) for word in words) / len(words))
        else:
            averages.append(0)
    return pd.Series(averages, index=_result_index(messages_series), dtype="float64")


df['avg_word_len'] = get_word_len2(df['Message'])
plt.subplot(3, 2, 4)
sns.histplot(data=df, x='avg_word_len', hue='Category', bins=50, kde=True)
plt.title('Word length Distribution')


# Average sentence length per message (words per sentence)
def get_avg_sentence_len(messages_series: pd.Series) -> pd.Series:
    """Return the average number of words per sentence for each message.

    A word containing ".", "?" or "!" closes the current sentence; the next
    word that contains at least one alphanumeric character opens a new one.
    This is a heuristic: decimals or URLs containing "." also close a
    sentence.

    Args:
        messages_series: Iterable of message strings.

    Returns:
        A float Series with one entry per message (0.0 for a message with no
        words), sharing the input's index when the input is a Series.
    """
    averages = []
    for message in messages_series:
        # Drop empty tokens caused by repeated/leading/trailing spaces so
        # they are not counted as words.
        words = [word for word in message.split(" ") if word]
        if not words:
            # Always emit a value so the result stays aligned with the input.
            averages.append(0.0)
            continue

        sentence_count = 1
        sentence_finished = False
        for word in words:
            # Open the next sentence BEFORE looking at this word's own
            # terminator, so one-word sentences ("Hi. Ok. Bye") are counted.
            # Pure-punctuation tokens (". . .") do not open a sentence.
            if sentence_finished and any(char.isalnum() for char in word):
                sentence_count += 1
                sentence_finished = False
            if any(mark in word for mark in _SENTENCE_END_MARKS):
                sentence_finished = True

        averages.append(len(words) / sentence_count)
    return pd.Series(averages, index=_result_index(messages_series), dtype="float64")


df['avg_sentence_len'] = get_avg_sentence_len(df['Message'])
plt.subplot(3, 2, 5)
sns.histplot(data=df, x='avg_sentence_len', hue='Category', bins=50, kde=True)
plt.title('Sentence length Distribution')

plt.tight_layout()
plt.savefig('eda_plots.png')
print("\nEDA plots saved to eda_plots.png")