import pandas as pd from sklearn.naive_bayes import MultinomialNB from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.model_selection import train_test_split from sklearn.metrics import classification_report, accuracy_score, confusion_matrix import matplotlib.pyplot as plt import seaborn as sns import pickle class SpamClassifier: def __init__(self): self.vectorizer = TfidfVectorizer(stop_words='english') self.model = MultinomialNB() def import_datasets(self, path="datasets/email.csv") -> pd.DataFrame: df = pd.read_csv(path) return df def preprocess(self, df: pd.DataFrame) -> pd.DataFrame: value = df.iloc[-1, 0] df.drop(df[df['Category'] == value].index, inplace=True) df['Message'] = df['Message'].str.lower().str.replace(r'[^\w\s]', '', regex=True) df['Category'] = df['Category'].map({'ham': 0, 'spam': 1}) return df def train(self, X_train, y_train): X_train_tfidf = self.vectorizer.fit_transform(X_train) self.model.fit(X_train_tfidf, y_train) def evaluate(self, X_test, y_test): X_test_tfidf = self.vectorizer.transform(X_test) y_pred = self.model.predict(X_test_tfidf) acc = accuracy_score(y_test, y_pred) print(f"Accuracy: {acc * 100:.2f}%") print(classification_report(y_test, y_pred, target_names=['ham', 'spam'])) cm = confusion_matrix(y_test, y_pred) sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['ham', 'spam'], yticklabels=['ham', 'spam']) plt.xlabel('Predicted') plt.ylabel('True') plt.title('Confusion Matrix') plt.show() def predict(self, text): text_tfidf = self.vectorizer.transform([text]) prediction = self.model.predict(text_tfidf) return 'spam' if prediction == 1 else 'ham' def export(self, model_filename='spam_model.pkl', vectorizer_filename='vectorizer.pkl'): with open(model_filename, 'wb') as model_file: pickle.dump(self.model, model_file) with open(vectorizer_filename, 'wb') as vectorizer_file: pickle.dump(self.vectorizer, vectorizer_file) print(f"Model and vectorizer exported as {model_filename} and {vectorizer_filename}")