deepl-project / eda_script.py
picket-cliff's picture
Upload 36 files
ca1226e verified
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
import numpy as np
# Load dataset. The file's own header row is skipped (skiprows=1) and replaced
# with explicit column names that the rest of the script relies on.
df = pd.read_csv('mail_data.csv', names=['Category', 'Message'], header=None, skiprows=1)

# Basic info
print("Dataset Shape:", df.shape)
print("\nValue Counts:\n", df['Category'].value_counts())
print("\nMissing Values:\n", df.isnull().sum())

# A missing message is read as NaN (a float), on which len() and the
# regex-based helpers further down raise TypeError. The missing values are
# reported above first; here they are normalised to empty strings so that
# every row can still be analysed.
df['Message'] = df['Message'].fillna('').astype(str)

# Add message length (number of characters per message)
df['Length'] = df['Message'].apply(len)

# Visualizations: one figure holding a 3x2 grid of panels
plt.figure(figsize=(18, 10))

# 1. Class Distribution
plt.subplot(3, 2, 1)
sns.countplot(x='Category', data=df)
plt.title('Class Distribution (Spam vs Ham)')

# 2. Message Length Distribution
plt.subplot(3, 2, 2)
sns.histplot(data=df, x='Length', hue='Category', bins=50, kde=True)
plt.title('Message Length Distribution')

# Sample messages: the first row of each class
print("\nSample Ham:")
print(df[df['Category'] == 'ham']['Message'].iloc[0])
print("\nSample Spam:")
print(df[df['Category'] == 'spam']['Message'].iloc[0])
#Let's check whether other characteristics of our text data separate spam from ham
#Word counts (how often different words are used)
def get_word_count(messages_series: pd.Series) -> pd.Series:
    """Return the relative frequency of every word across a set of messages.

    Each message has the characters ``. | ? , !`` removed, is lower-cased and
    is split on single spaces; empty tokens are discarded. Entries that are
    not strings (for example NaN from an empty CSV cell) are skipped.

    Args:
        messages_series: Iterable of message texts.

    Returns:
        A Series indexed by word whose values are that word's share of all
        words seen, as produced by ``value_counts(normalize=True)``.
    """
    # Raw string so the pattern reaches `re` verbatim. Inside a character
    # class the pipe is a literal, so '|' is stripped along with punctuation.
    punctuation = re.compile(r"[.|?,!]+")

    word_list = []
    for message in messages_series:
        if not isinstance(message, str):
            continue
        cleaned = punctuation.sub("", message).lower()
        # Dropping empty tokens also absorbs runs of consecutive spaces.
        word_list.extend(word for word in cleaned.split(" ") if word)

    return pd.Series(word_list).value_counts(normalize=True)
# Relative word frequencies within each class.
spam_word_count = get_word_count(df[df['Category'] == 'spam']['Message'])
ham_word_count = get_word_count(df[df['Category'] == 'ham']['Message'])

# Compare the word frequencies to find the words used more often in spam
# emails than in ham emails (and vice versa).
# The vocabulary is the UNION of both classes: a word that appears in only one
# class gets frequency 0 in the other. Using the intersection would drop
# exactly those class-exclusive words. Sorting makes the order deterministic.
words_list = sorted(set(spam_word_count.index) | set(ham_word_count.index))

wordcount_distance_spam_ham = []
for word in words_list:
    spam_count_i = spam_word_count[word] if word in spam_word_count else 0
    ham_count_i = ham_word_count[word] if word in ham_word_count else 0
    # Positive: more frequent in ham. Negative: more frequent in spam.
    wordcount_distance_spam_ham.append(ham_count_i - spam_count_i)

wordcount_distance_spam_ham = pd.Series(wordcount_distance_spam_ham, index=words_list, dtype=float)
wordcount_distance_spam_ham = wordcount_distance_spam_ham.sort_values(ascending=False)

print("Words more often found in normal emails than spam emails")
print(wordcount_distance_spam_ham.iloc[:10])  # Words more often present in ham emails
print("words more often found in spam emails than normal emails")
print(wordcount_distance_spam_ham.iloc[-10:])  # Words more often present in spam emails
#Mail word count (number of words used in the message)
def get_word_len(messages_series: pd.Series) -> pd.Series:
    """Count the words in each message.

    The characters ``. | ? , !`` and all digits are removed before the message
    is split on single spaces; empty tokens are not counted. Entries that are
    not strings (for example NaN from an empty CSV cell) count as 0 words.

    Args:
        messages_series: Message texts, one per row.

    Returns:
        A Series with one integer word count per input message. When the input
        is a Series its index is reused, so the result aligns row-for-row when
        assigned back to the originating DataFrame.
    """
    # Raw string so the pattern reaches `re` verbatim. Inside a character
    # class the pipe is a literal, so '|' is stripped as well.
    strip_chars = re.compile(r"[.|?,!0-9]+")

    counts = []
    for message in messages_series:
        text = message if isinstance(message, str) else ""
        cleaned = strip_chars.sub("", text)
        # Skipping empty tokens also absorbs runs of consecutive spaces.
        counts.append(sum(1 for word in cleaned.split(" ") if word))

    index = messages_series.index if isinstance(messages_series, pd.Series) else None
    return pd.Series(counts, index=index, dtype="int64")
# Words per message, drawn per class in the third panel of the 3x2 grid.
df['word_len'] = get_word_len(df['Message'])
word_count_ax = plt.subplot(3, 2, 3)
sns.histplot(data=df, x='word_len', hue='Category', bins=50, kde=True, ax=word_count_ax)
word_count_ax.set_title('Word count Distribution')
#Average word length (mean number of characters per word in the message)
def get_word_len2(messages_series: pd.Series) -> pd.Series:
    """Compute the average word length (in characters) of each message.

    The characters ``. | ? , !`` and all digits are removed, the message is
    lower-cased and split on single spaces, and empty tokens are discarded.
    A message with no remaining words gets an average of 0. Entries that are
    not strings (for example NaN from an empty CSV cell) are treated as
    empty messages.

    Args:
        messages_series: Message texts, one per row.

    Returns:
        A float Series with one average per input message. When the input is
        a Series its index is reused, so the result aligns row-for-row when
        assigned back to the originating DataFrame.
    """
    # Raw string so the pattern reaches `re` verbatim. Inside a character
    # class the pipe is a literal, so '|' is stripped as well.
    strip_chars = re.compile(r"[.|?,!0-9]+")

    averages = []
    for message in messages_series:
        text = message if isinstance(message, str) else ""
        cleaned = strip_chars.sub("", text).lower()
        words = [word for word in cleaned.split(" ") if word]
        if words:
            averages.append(sum(len(word) for word in words) / len(words))
        else:
            # No words left: avoid dividing by zero.
            averages.append(0.0)

    index = messages_series.index if isinstance(messages_series, pd.Series) else None
    return pd.Series(averages, index=index, dtype=float)
# Average word length per message, drawn per class in the fourth panel of the 3x2 grid.
df['avg_word_len'] = get_word_len2(df['Message'])
word_length_ax = plt.subplot(3, 2, 4)
sns.histplot(data=df, x='avg_word_len', hue='Category', bins=50, kde=True, ax=word_length_ax)
word_length_ax.set_title('Word length Distribution')
#Average sentence length (mean number of words per sentence in the message)
def get_avg_sentence_len(messages_series: pd.Series) -> pd.Series:
    """Compute the average number of words per sentence for each message.

    A message is split on single spaces and empty tokens are discarded. A
    token containing ``.``, ``?`` or ``!`` closes the current sentence, and
    the next token (whatever it contains) opens a new one. The average is the
    word count divided by the sentence count. A message with no words gets 0.
    Entries that are not strings (for example NaN from an empty CSV cell) are
    treated as empty messages.

    Args:
        messages_series: Message texts, one per row.

    Returns:
        A float Series with one average per input message. When the input is
        a Series its index is reused, so the result aligns row-for-row when
        assigned back to the originating DataFrame.
    """
    terminators = (".", "?", "!")

    averages = []
    for message in messages_series:
        text = message if isinstance(message, str) else ""
        # Dropping empty tokens keeps leading, trailing or repeated spaces
        # from being counted as words.
        words = [word for word in text.split(" ") if word]
        if not words:
            # Still emit a value so the output stays one-per-message.
            averages.append(0.0)
            continue

        sentence_count = 1
        previous_sentence_closed = False
        for word in words:
            # Open a new sentence on ANY token that follows a closed one, so
            # one-word sentences such as "Hi. Ok. Bye" are each counted.
            if previous_sentence_closed:
                sentence_count += 1
            previous_sentence_closed = any(mark in word for mark in terminators)

        averages.append(len(words) / sentence_count)

    index = messages_series.index if isinstance(messages_series, pd.Series) else None
    return pd.Series(averages, index=index, dtype=float)
# Average sentence length per message, drawn per class in the fifth panel of the 3x2 grid.
df['avg_sentence_len'] = get_avg_sentence_len(df['Message'])
sentence_length_ax = plt.subplot(3, 2, 5)
sns.histplot(
    data=df,
    x='avg_sentence_len',
    hue='Category',
    bins=50,
    kde=True,
    ax=sentence_length_ax,
)
sentence_length_ax.set_title('Sentence length Distribution')

# Tidy the panel spacing, then write the whole figure to disk.
plt.tight_layout()
plt.savefig('eda_plots.png')
print("\nEDA plots saved to eda_plots.png")