| import pandas as pd |
| import matplotlib.pyplot as plt |
| import seaborn as sns |
| import re |
| import os |
| import numpy as np |
|
|
| |
# Load the mail dataset. The first row of the file is skipped and the two
# columns are given explicit names (presumably the skipped row is the file's
# own header -- confirm against the CSV).
df = pd.read_csv(
    'mail_data.csv',
    names=['Category', 'Message'],
    header=None,
    skiprows=1,
)

# Quick overview: size, label balance and missing values per column.
category_counts = df['Category'].value_counts()
missing_per_column = df.isnull().sum()
print("Dataset Shape:", df.shape)
print("\nValue Counts:\n", category_counts)
print("\nMissing Values:\n", missing_per_column)

# Character length of every message.
df['Length'] = df['Message'].map(len)
|
|
| |
# One shared figure; the panels are laid out on a 3x2 grid.
plt.figure(figsize=(18, 10))

# Panel 1: number of messages per category.
class_ax = plt.subplot(3, 2, 1)
sns.countplot(x='Category', data=df, ax=class_ax)
class_ax.set_title('Class Distribution (Spam vs Ham)')

# Panel 2: character-length histogram, split by category.
length_ax = plt.subplot(3, 2, 2)
sns.histplot(data=df, x='Length', hue='Category', bins=50, kde=True, ax=length_ax)
length_ax.set_title('Message Length Distribution')

# Print the first message of each category as a sample.
for heading, label in (("\nSample Ham:", 'ham'), ("\nSample Spam:", 'spam')):
    print(heading)
    print(df.loc[df['Category'] == label, 'Message'].iloc[0])
|
|
| |
|
|
| |
def get_word_count(messages_series: pd.Series) -> pd.Series:
    """Return the relative frequency of every word across the messages.

    Each message has the characters ``.``, ``|``, ``?``, ``,`` and ``!``
    removed, is lower-cased, and is then split on single spaces; empty
    tokens are discarded.

    Args:
        messages_series: Iterable of message strings.

    Returns:
        A Series indexed by word whose values are each word's share of all
        words seen (``value_counts(normalize=True)``), most frequent first.
        Empty when no words are found.
    """
    # Raw-string patterns: the previous "\." / "\ " escapes inside normal
    # string literals are invalid escape sequences (SyntaxWarning on 3.12+).
    # "|" stays in the character class so results match the earlier output.
    punctuation = re.compile(r"[.|?,!]+")
    multi_space = re.compile(r" {2,}")

    word_list = []
    for message in messages_series:
        clean_msg = multi_space.sub(" ", punctuation.sub("", message).lower())
        word_list.extend(word for word in clean_msg.split(" ") if word)

    # Explicit dtype keeps the empty-input case well defined.
    return pd.Series(word_list, dtype=object).value_counts(normalize=True)
|
|
# Relative word frequencies for each class.
spam_word_count = get_word_count(df[df['Category'] == 'spam']['Message'])
ham_word_count = get_word_count(df[df['Category'] == 'ham']['Message'])

# Difference in relative frequency (ham minus spam) for every word that
# occurs in EITHER class. fill_value=0 treats a word missing from one class
# as frequency 0 there. The earlier version built the vocabulary with a set
# intersection, which made its "missing -> 0" branches unreachable and
# silently dropped words that appear in only one class.
wordcount_distance_spam_ham = ham_word_count.sub(spam_word_count, fill_value=0)
wordcount_distance_spam_ham.name = None  # keep the printed output unnamed
words_list = wordcount_distance_spam_ham.index.tolist()

# Largest positive values = ham-leaning words; most negative = spam-leaning.
wordcount_distance_spam_ham = wordcount_distance_spam_ham.sort_values(ascending=False)
print("Words more often found in normal emails than spam emails")
print(wordcount_distance_spam_ham.head(10))
print("words more often found in spam emails than normal emails")
print(wordcount_distance_spam_ham.tail(10))
|
|
| |
| def get_word_len(messages_series: pd.Series)->tuple: |
| word_len_list = [] |
| for message in messages_series: |
| clean_msg = re.sub("[\.|\?|,|\!|0-9|]+", "", message).lower() |
| clean_msg = re.sub("\ \ +", " ", clean_msg).split(" ") |
| clean_msg = [word for word in clean_msg if word!=""] |
| word_len_list.append(len(clean_msg)) |
|
|
| len_count = pd.Series(word_len_list) |
| return len_count |
|
|
# Words per message, stored next to the raw text.
df['word_len'] = get_word_len(df['Message'])

# Panel 3: histogram of words per message, split by category.
word_count_ax = plt.subplot(3, 2, 3)
sns.histplot(data=df, x='word_len', hue='Category', bins=50, kde=True, ax=word_count_ax)
word_count_ax.set_title('Word count Distribution')
|
|
| |
def get_word_len2(messages_series: pd.Series) -> pd.Series:
    """Compute the average word length (in characters) of each message.

    The characters ``.``, ``|``, ``?``, ``,``, ``!`` and the digits 0-9 are
    removed before the message is lower-cased and split on single spaces;
    empty tokens are ignored.

    Args:
        messages_series: Message strings, normally a pandas Series.

    Returns:
        A float Series with one average per message (0 for a message with no
        words). It carries the same index as ``messages_series`` so that
        assigning it to a DataFrame column lines up row by row.
    """
    # Raw-string patterns avoid the invalid "\." / "\ " escape sequences.
    strip_chars = re.compile(r"[.|?,!0-9]+")
    multi_space = re.compile(r" {2,}")

    averages = []
    for message in messages_series:
        clean_msg = multi_space.sub(" ", strip_chars.sub("", message).lower())
        words = [word for word in clean_msg.split(" ") if word]
        # Guard against division by zero for messages with no words.
        averages.append(sum(map(len, words)) / len(words) if words else 0.0)

    # Plain iterables have no index; fall back to the default RangeIndex.
    return pd.Series(
        averages,
        index=getattr(messages_series, "index", None),
        dtype="float64",
    )
|
|
# Average characters per word for every message.
df['avg_word_len'] = get_word_len2(df['Message'])

# Panel 4: histogram of average word length, split by category.
word_length_ax = plt.subplot(3, 2, 4)
sns.histplot(data=df, x='avg_word_len', hue='Category', bins=50, kde=True, ax=word_length_ax)
word_length_ax.set_title('Word length Distribution')
|
|
| |
def get_avg_sentence_len(messages_series: pd.Series) -> pd.Series:
    """Estimate the average sentence length (in words) of each message.

    A message is split on spaces. A token containing ``.``, ``?`` or ``!``
    marks the end of a sentence, and a new sentence is counted when a token
    without such a character follows one. This is a heuristic: tokens such as
    ``3.5`` or ``e.g.`` are also treated as sentence ends.

    Args:
        messages_series: Message strings, normally a pandas Series.

    Returns:
        A float Series of ``words / sentences`` per message, with 0.0 for a
        message that contains no words. It carries the same index as
        ``messages_series`` so that assigning it to a DataFrame column lines
        up row by row.
    """
    sentence_enders = (".", "?", "!")

    averages = []
    for message in messages_series:
        # Dropping empty tokens handles repeated, leading and trailing spaces,
        # which were previously counted as words.
        words = [word for word in message.split(" ") if word]
        if not words:
            # Always emit one value per message so rows stay aligned.
            averages.append(0.0)
            continue

        sentence_number = 1
        sentence_finished = False
        for word in words:
            if any(mark in word for mark in sentence_enders):
                sentence_finished = True
            elif sentence_finished:
                # First plain token after a terminator starts a new sentence.
                sentence_number += 1
                sentence_finished = False
        averages.append(len(words) / sentence_number)

    # Plain iterables have no index; fall back to the default RangeIndex.
    return pd.Series(
        averages,
        index=getattr(messages_series, "index", None),
        dtype="float64",
    )
|
|
# Average words per sentence for every message.
df['avg_sentence_len'] = get_avg_sentence_len(df['Message'])

# Panel 5: histogram of average sentence length, split by category.
sentence_length_ax = plt.subplot(3, 2, 5)
sns.histplot(data=df, x='avg_sentence_len', hue='Category', bins=50, kde=True, ax=sentence_length_ax)
sentence_length_ax.set_title('Sentence length Distribution')
|
|
|
|
|
|
# Tidy the spacing between panels, then write the current figure to disk.
eda_figure = plt.gcf()
eda_figure.tight_layout()
eda_figure.savefig('eda_plots.png')
print("\nEDA plots saved to eda_plots.png")
|
|
|
|