dangminh214
copy from github repo
135aa7a
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
class SpamClassifier:
def __init__(self):
self.vectorizer = TfidfVectorizer(stop_words='english')
self.model = MultinomialNB()
def import_datasets(self, path="datasets/email.csv") -> pd.DataFrame:
df = pd.read_csv(path)
return df
def preprocess(self, df: pd.DataFrame) -> pd.DataFrame:
value = df.iloc[-1, 0]
df.drop(df[df['Category'] == value].index, inplace=True)
df['Message'] = df['Message'].str.lower().str.replace(r'[^\w\s]', '', regex=True)
df['Category'] = df['Category'].map({'ham': 0, 'spam': 1})
return df
def train(self, X_train, y_train):
X_train_tfidf = self.vectorizer.fit_transform(X_train)
self.model.fit(X_train_tfidf, y_train)
def evaluate(self, X_test, y_test):
X_test_tfidf = self.vectorizer.transform(X_test)
y_pred = self.model.predict(X_test_tfidf)
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc * 100:.2f}%")
print(classification_report(y_test, y_pred, target_names=['ham', 'spam']))
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['ham', 'spam'], yticklabels=['ham', 'spam'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()
def predict(self, text):
text_tfidf = self.vectorizer.transform([text])
prediction = self.model.predict(text_tfidf)
return 'spam' if prediction == 1 else 'ham'
def export(self, model_filename='spam_model.pkl', vectorizer_filename='vectorizer.pkl'):
with open(model_filename, 'wb') as model_file:
pickle.dump(self.model, model_file)
with open(vectorizer_filename, 'wb') as vectorizer_file:
pickle.dump(self.vectorizer, vectorizer_file)
print(f"Model and vectorizer exported as {model_filename} and {vectorizer_filename}")