Upload 36 files

ca1226e verified 3 months ago

5.36 kB

	import pandas as pd
	import matplotlib.pyplot as plt
	import seaborn as sns
	import re
	import os
	import numpy as np

	# Load dataset. The first row of the file is skipped and the two columns
	# are given explicit names instead.
	df = pd.read_csv(
	    'mail_data.csv',
	    names=['Category', 'Message'],
	    header=None,
	    skiprows=1,
	)

	# Basic info: size, class balance and missing values per column
	print("Dataset Shape:", df.shape)
	print("\nValue Counts:\n", df['Category'].value_counts())
	print("\nMissing Values:\n", df.isnull().sum())

	# Add message length (number of characters in each message)
	df['Length'] = df['Message'].map(len)

	# Visualizations: one figure holding a 3x2 grid of panels
	plt.figure(figsize=(18, 10))

	# 1. Class Distribution
	class_ax = plt.subplot(3, 2, 1)
	sns.countplot(x='Category', data=df, ax=class_ax)
	class_ax.set_title('Class Distribution (Spam vs Ham)')

	# 2. Message Length Distribution
	length_ax = plt.subplot(3, 2, 2)
	sns.histplot(data=df, x='Length', hue='Category', bins=50, kde=True, ax=length_ax)
	length_ax.set_title('Message Length Distribution')

	# Sample messages: the first message of each class
	print("\nSample Ham:")
	print(df.loc[df['Category'] == 'ham', 'Message'].iloc[0])
	print("\nSample Spam:")
	print(df.loc[df['Category'] == 'spam', 'Message'].iloc[0])

	# Let's check whether other characteristics of the text data separate spam from ham.

	# Word counts (how often different words are used)
	def get_word_count(messages_series: pd.Series) -> pd.Series:
	    """Return the relative frequency of every word across the messages.

	    Each message has the characters ``.``, ``|``, ``?``, ``,`` and ``!``
	    removed, is lower-cased and is split on single spaces; empty tokens are
	    discarded.

	    Args:
	        messages_series: Iterable of message strings.

	    Returns:
	        A Series indexed by word whose values are each word's share of all
	        tokens (``value_counts(normalize=True)``), most frequent first.
	    """
	    # Raw string: the previous non-raw pattern relied on invalid escape
	    # sequences, which newer Python versions warn about.
	    punctuation = re.compile(r"[.|?,!]+")

	    word_list = []
	    for message in messages_series:
	        clean_msg = punctuation.sub("", message).lower()
	        # Dropping empty tokens already neutralises runs of spaces, so no
	        # separate whitespace-collapsing pass is needed.
	        word_list.extend(word for word in clean_msg.split(" ") if word)

	    return pd.Series(word_list).value_counts(normalize=True)

	spam_word_count = get_word_count(df[df['Category'] == 'spam']['Message'])
	ham_word_count = get_word_count(df[df['Category'] == 'ham']['Message'])

	# Compare the word frequencies to find the words used more often in spam
	# emails than in ham emails (and vice versa).
	# The vocabulary is the UNION of both classes: a word that appears in only
	# one class gets frequency 0 in the other, so class-exclusive words (often
	# the most telling ones) are kept instead of being dropped.
	# Sorting the vocabulary makes the run reproducible.
	words_list = sorted(set(spam_word_count.index) | set(ham_word_count.index))
	wordcount_distance_spam_ham = pd.Series(
	    [
	        ham_word_count.get(word, 0) - spam_word_count.get(word, 0)
	        for word in words_list
	    ],
	    index=words_list,
	)
	# Positive values: more frequent in ham. Negative values: more frequent in spam.
	wordcount_distance_spam_ham = wordcount_distance_spam_ham.sort_values(ascending=False)
	print("Words more often found in normal emails than spam emails")
	print(wordcount_distance_spam_ham.head(10))  # Words more often present in ham emails
	print("words more often found in spam emails than normal emails")
	print(wordcount_distance_spam_ham.tail(10))  # Words more often present in spam emails

	#Mail Words length (numbers of words used in the message)
	def get_word_len(messages_series: pd.Series)->tuple:
	word_len_list = []
	for message in messages_series:
	clean_msg = re.sub("[\.\|\?\|,\|\!\|0-9\|]+", "", message).lower()
	clean_msg = re.sub("\ \ +", " ", clean_msg).split(" ")
	clean_msg = [word for word in clean_msg if word!=""]
	word_len_list.append(len(clean_msg))

	len_count = pd.Series(word_len_list)
	return len_count

	# Store the per-message word count as a new column
	df['word_len'] = get_word_len(df['Message'])

	# 3. Word count distribution (third panel of the 3x2 grid)
	word_count_ax = plt.subplot(3, 2, 3)
	sns.histplot(data=df, x='word_len', hue='Category', bins=50, kde=True, ax=word_count_ax)
	word_count_ax.set_title('Word count Distribution')

	# Average word length (mean number of characters per word in each message)
	def get_word_len2(messages_series: pd.Series) -> pd.Series:
	    """Return the average word length, in characters, of each message.

	    The characters ``.``, ``|``, ``?``, ``,``, ``!`` and all digits are
	    deleted first; the remainder is split on single spaces and empty tokens
	    are ignored.

	    Args:
	        messages_series: Iterable of message strings.

	    Returns:
	        A Series with one value per message: total characters in the kept
	        words divided by the number of kept words, or 0 when no word is
	        left. When the input has an ``index`` attribute it is reused, so
	        assigning the result to a DataFrame column stays row-aligned.
	    """
	    # Raw string: avoids the invalid escape sequences of the old pattern.
	    strip_chars = re.compile(r"[.|?,!0-9]+")

	    word_len_list = []
	    for message in messages_series:
	        words = [word for word in strip_chars.sub("", message).split(" ") if word]
	        if words:
	            word_len_list.append(sum(map(len, words)) / len(words))
	        else:
	            # No words left: avoid dividing by zero.
	            word_len_list.append(0.0)

	    return pd.Series(word_len_list, index=getattr(messages_series, "index", None))

	# Store the per-message average word length as a new column
	df['avg_word_len'] = get_word_len2(df['Message'])

	# 4. Word length distribution (fourth panel of the 3x2 grid)
	word_length_ax = plt.subplot(3, 2, 4)
	sns.histplot(data=df, x='avg_word_len', hue='Category', bins=50, kde=True, ax=word_length_ax)
	word_length_ax.set_title('Word length Distribution')

	# Average sentence length (mean number of words per sentence in each message)
	def get_avg_sentence_len(messages_series: pd.Series) -> pd.Series:
	    """Return the average number of words per sentence for each message.

	    A message is split on single spaces and empty tokens are ignored. A
	    token containing ``.``, ``?`` or ``!`` closes the current sentence; the
	    next token that contains a letter or digit opens a new one. This is a
	    heuristic: abbreviations and decimal numbers also close a sentence.

	    Args:
	        messages_series: Iterable of message strings.

	    Returns:
	        A Series with exactly one value per message: word count divided by
	        sentence count, or 0.0 for a message without any words. When the
	        input has an ``index`` attribute it is reused, so assigning the
	        result to a DataFrame column stays row-aligned.
	    """
	    avg_sentence_len = []
	    for message in messages_series:
	        # Ignore empty tokens so leading, trailing or repeated spaces are
	        # not counted as words.
	        words = [word for word in message.split(" ") if word]
	        if not words:
	            # Always emit a value so the output length matches the input.
	            avg_sentence_len.append(0.0)
	            continue

	        sentence_number = 1
	        sentence_finished = False
	        for word in words:
	            # Open a new sentence on the first real word after a terminator,
	            # even when that word itself ends a sentence ("Ok. Yes. No.").
	            # Punctuation-only tokens such as "!!!" never open one.
	            if sentence_finished and any(char.isalnum() for char in word):
	                sentence_number += 1
	                sentence_finished = False
	            if any(mark in word for mark in ".?!"):
	                sentence_finished = True

	        avg_sentence_len.append(len(words) / sentence_number)

	    return pd.Series(avg_sentence_len, index=getattr(messages_series, "index", None))

	# Store the per-message average sentence length as a new column
	df['avg_sentence_len'] = get_avg_sentence_len(df['Message'])

	# 5. Sentence length distribution (fifth panel of the 3x2 grid)
	sentence_ax = plt.subplot(3, 2, 5)
	sns.histplot(data=df, x='avg_sentence_len', hue='Category', bins=50, kde=True, ax=sentence_ax)
	sentence_ax.set_title('Sentence length Distribution')

	# Tidy the panel spacing, then write the whole figure to disk
	plt.tight_layout()
	plt.savefig('eda_plots.png')
	print("\nEDA plots saved to eda_plots.png")