# File size: 2,306 Bytes
# 135aa7a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

class SpamClassifier:
    """Ham/spam text classifier: TF-IDF features fed to multinomial naive Bayes.

    Typical flow: ``import_datasets`` -> ``preprocess`` -> split the data ->
    ``train`` -> ``evaluate`` / ``predict`` -> ``export``.
    """

    # Textual labels expected in the ``Category`` column and the integer
    # class each one is encoded as for the model.
    _LABEL_TO_ID = {'ham': 0, 'spam': 1}

    def __init__(self):
        """Create an unfitted vectorizer (English stop words removed) and model."""
        self.vectorizer = TfidfVectorizer(stop_words='english')
        self.model = MultinomialNB()

    def import_datasets(self, path: str = "datasets/email.csv") -> pd.DataFrame:
        """Read the CSV file at *path* and return it as a DataFrame."""
        return pd.read_csv(path)

    def preprocess(self, df: pd.DataFrame) -> pd.DataFrame:
        """Clean *df* in place and return it.

        Steps:
          1. Drop rows whose ``Category`` is not ``'ham'``/``'spam'`` or whose
             ``Message`` is missing.
          2. Lower-case ``Message`` and strip every character that is neither
             a word character nor whitespace.
          3. Encode ``Category`` as 0 (ham) / 1 (spam).

        Args:
            df: Frame with ``Category`` and ``Message`` columns. It is
                modified in place.

        Returns:
            The same (mutated) DataFrame object.
        """
        label_ids = self._LABEL_TO_ID
        # Filter on the set of valid labels rather than on "whatever the last
        # row holds": the latter wipes out a whole class whenever the file
        # does not end with a junk row.
        usable = df['Category'].isin(list(label_ids)) & df['Message'].notna()
        df.drop(index=df.index[~usable], inplace=True)

        lowered = df['Message'].str.lower()
        df['Message'] = lowered.str.replace(r'[^\w\s]', '', regex=True)
        df['Category'] = df['Category'].map(label_ids)
        return df

    def train(self, X_train, y_train) -> None:
        """Fit the vectorizer on *X_train* and the model on the resulting features.

        Args:
            X_train: Iterable of raw text documents.
            y_train: Class labels aligned with *X_train*.
        """
        features = self.vectorizer.fit_transform(X_train)
        self.model.fit(features, y_train)

    def evaluate(self, X_test, y_test) -> None:
        """Print accuracy and a classification report, then plot the confusion matrix.

        Args:
            X_test: Iterable of raw text documents.
            y_test: True class labels (0 = ham, 1 = spam) aligned with *X_test*.
        """
        names = list(self._LABEL_TO_ID)
        ids = list(self._LABEL_TO_ID.values())

        features = self.vectorizer.transform(X_test)
        y_pred = self.model.predict(features)

        acc = accuracy_score(y_test, y_pred)
        print(f"Accuracy: {acc * 100:.2f}%")
        # Explicit ``labels`` keep the report and the matrix two-class even if
        # one class happens to be absent from this split, so the names below
        # always line up with the right rows/columns.
        print(classification_report(y_test, y_pred, labels=ids, target_names=names))
        cm = confusion_matrix(y_test, y_pred, labels=ids)

        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=names, yticklabels=names)
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.title('Confusion Matrix')
        plt.show()

    def predict(self, text) -> str:
        """Classify a single message.

        Args:
            text: The raw message text.

        Returns:
            ``'spam'`` if the model predicts class 1, otherwise ``'ham'``.
        """
        features = self.vectorizer.transform([text])
        # ``predict`` returns an array with one entry per document; take the
        # single entry instead of relying on array truthiness.
        predicted_class = self.model.predict(features)[0]
        return 'spam' if predicted_class == 1 else 'ham'

    def export(self, model_filename='spam_model.pkl', vectorizer_filename='vectorizer.pkl') -> None:
        """Pickle the model and the vectorizer to the given file paths.

        Args:
            model_filename: Destination path for the pickled model.
            vectorizer_filename: Destination path for the pickled vectorizer.
        """
        with open(model_filename, 'wb') as model_file:
            pickle.dump(self.model, model_file)

        with open(vectorizer_filename, 'wb') as vectorizer_file:
            pickle.dump(self.vectorizer, vectorizer_file)

        print(f"Model and vectorizer exported as {model_filename} and {vectorizer_filename}")