# spam-classifier-app/models/spam_model.py
# kimlay1's picture
# Upload 15 files
# 4b30156 verified
import math
import re
import dill
from collections import Counter
class SpamNaiveBayes:
    """Multinomial Naive Bayes text classifier with additive smoothing.

    Labels equal to ``1`` are treated as spam; every other label is treated
    as ham. After :meth:`train`, the fitted state is:

    * ``vocab``     -- sorted list of every token seen during training
    * ``log_spam``  -- token -> log P(token | spam)
    * ``log_ham``   -- token -> log P(token | ham)
    * ``P_spam`` / ``P_ham``     -- class priors (fraction of documents)
    * ``unk_spam`` / ``unk_ham`` -- log-probability used for unseen tokens
    """

    def __init__(self, alpha=1):
        """Create an untrained model.

        Args:
            alpha: Additive smoothing constant added to every token count.
                Must be positive by the time :meth:`train` is called.
        """
        self.alpha = alpha
        self.vocab = set()
        self.log_spam = {}
        self.log_ham = {}
        self.P_spam = 0
        self.P_ham = 0
        self.unk_spam = 0
        self.unk_ham = 0

    def tokenize(self, text):
        """Split *text* into lowercase tokens.

        The input is coerced with ``str()`` first, so non-string values
        (e.g. ``None`` or numbers) are tokenized by their string form.
        Tokens are runs of word characters, plus the single punctuation
        marks ``!``, ``?`` and ``.`` as tokens of their own.
        """
        return re.findall(r"\w+|[!?.]", str(text).lower())

    def train(self, texts, labels):
        """Fit the model from scratch on parallel *texts* and *labels*.

        Any previously fitted state is replaced, so the method may be called
        repeatedly on the same instance.

        Args:
            texts: Iterable of documents (any iterable, including generators).
            labels: Iterable of labels, one per document; ``1`` means spam,
                anything else means ham.

        Raises:
            ValueError: If ``alpha`` is not positive, the dataset is empty,
                the two inputs differ in length, or no document yields a
                token.
        """
        if not self.alpha > 0:
            # alpha <= 0 would make the unseen-token probability non-positive
            # and fail later inside math.log with an unhelpful message.
            raise ValueError(f"alpha must be positive (got {self.alpha!r}).")

        # Materialise once so one-shot iterables (generators) work and the
        # lengths can be compared.
        texts = list(texts)
        labels = list(labels)
        total_docs = len(labels)
        if total_docs == 0:
            raise ValueError("Cannot train on an empty dataset.")
        if len(texts) != total_docs:
            raise ValueError(
                "texts and labels must have the same length "
                f"(got {len(texts)} and {total_docs})."
            )

        # Counts: tokenize each document exactly once.
        wc_spam = Counter()
        wc_ham = Counter()
        spam_docs = 0
        for txt, lab in zip(texts, labels):
            toks = self.tokenize(txt)
            if lab == 1:
                spam_docs += 1
                wc_spam.update(toks)
            else:
                wc_ham.update(toks)
        ham_docs = total_docs - spam_docs

        # Build Vocab: every token seen in either class. Built in a local
        # and assigned as a fresh list so retraining never touches the
        # previous (list-typed) vocabulary.
        vocab = sorted(set(wc_spam) | set(wc_ham))
        V = len(vocab)
        if V == 0:
            raise ValueError("Training texts contain no tokens.")
        self.vocab = vocab

        # Calculate Probabilities
        self.P_spam = spam_docs / total_docs
        self.P_ham = ham_docs / total_docs

        # Smoothed denominators: class token total plus alpha per vocab entry.
        total_spam = sum(wc_spam.values()) + self.alpha * V
        total_ham = sum(wc_ham.values()) + self.alpha * V

        self.log_spam = {
            w: math.log((wc_spam[w] + self.alpha) / total_spam) for w in vocab
        }
        self.log_ham = {
            w: math.log((wc_ham[w] + self.alpha) / total_ham) for w in vocab
        }

        # Tokens never seen in training get the zero-count smoothed estimate.
        self.unk_spam = math.log(self.alpha / total_spam)
        self.unk_ham = math.log(self.alpha / total_ham)
        print("Training Complete.")

    def predict(self, text):
        """Classify *text*; return ``1`` for spam and ``0`` for ham.

        Sums the log prior and per-token log-likelihoods for each class.
        Ties are resolved as ham (``0``).
        """
        toks = self.tokenize(text)
        # The small epsilon keeps math.log defined when a class prior is 0
        # (e.g. the training data contained only one class).
        s_spam = math.log(self.P_spam + 1e-12)
        s_ham = math.log(self.P_ham + 1e-12)
        for t in toks:
            s_spam += self.log_spam.get(t, self.unk_spam)
            s_ham += self.log_ham.get(t, self.unk_ham)
        return 1 if s_spam > s_ham else 0
if __name__ == "__main__":
    # Script entry point: download the dataset, fit a fresh model and
    # serialise it with dill for upload.
    from datasets import load_dataset

    print("Loading data...")
    ds = load_dataset("mshenoda/spam-messages")

    # Collect documents and normalised labels in a single pass over the
    # training split.
    texts = []
    labels = []
    for row in ds['train']:
        texts.append(row['text'])
        raw_label = row['label']
        if isinstance(raw_label, str):
            # String labels: 'spam' or '1' (case-insensitive) mean spam.
            is_spam = raw_label.lower() in ['spam', '1']
            labels.append(1 if is_spam else 0)
        else:
            # Non-string labels are coerced to int as-is.
            labels.append(int(raw_label))

    print("Training clean model...")
    model = SpamNaiveBayes()
    model.train(texts, labels)

    with open("model_nb_clean.pkl", "wb") as f:
        dill.dump(model, f)
    print("✅ Success! 'model_nb_clean.pkl' created. Upload this file to Hugging Face.")