Spaces:

kimlay1
/

spam-classifier-app

Sleeping

App Files Files Community

spam-classifier-app / models /spam_model.py

kimlay1

Upload 15 files

4b30156 verified 3 months ago

raw

history blame contribute delete

2.75 kB

	import math
	import re
	import dill
	from collections import Counter

	class SpamNaiveBayes:
	    """Multinomial Naive Bayes spam/ham text classifier with additive smoothing.

	    Labels are binary: ``1`` means spam, any other value is treated as ham.

	    Attributes populated by :meth:`train`:
	        vocab: sorted list of every token seen in the training texts
	            (an empty set until the model is trained).
	        log_spam, log_ham: token -> smoothed log-likelihood for each class.
	        P_spam, P_ham: class priors (fraction of training documents).
	        unk_spam, unk_ham: log-likelihood applied to tokens outside ``vocab``.
	    """

	    def __init__(self, alpha=1):
	        """Create an untrained model.

	        Args:
	            alpha: additive (Laplace) smoothing constant; must be positive
	                by the time :meth:`train` is called.
	        """
	        self.alpha = alpha
	        self.vocab = set()
	        self.log_spam = {}
	        self.log_ham = {}
	        self.P_spam = 0
	        self.P_ham = 0
	        self.unk_spam = 0
	        self.unk_ham = 0

	    def tokenize(self, text):
	        """Split *text* into lowercase tokens.

	        A token is either a run of word characters (``\\w+``) or a single
	        ``!``, ``?`` or ``.`` character. The input is passed through
	        ``str()`` first, so non-string values are accepted.

	        Returns:
	            A list of token strings, in order of appearance.
	        """
	        # The "|" must be an unescaped alternation; escaping it ("\|") would
	        # require a literal pipe character and match almost nothing.
	        return re.findall(r"\w+|[!?.]", str(text).lower())

	    def train(self, texts, labels):
	        """Fit priors and smoothed per-token log-likelihoods.

	        Any previous training state is replaced, so the method can be
	        called more than once on the same instance.

	        Args:
	            texts: iterable of documents (each is tokenized with
	                :meth:`tokenize`).
	            labels: iterable of labels aligned with *texts*; ``1`` is spam,
	                everything else is ham.

	        Raises:
	            ValueError: if the data is empty, the two inputs differ in
	                length, ``alpha`` is not positive, or no document yields
	                any token.
	        """
	        # Materialize once so generators work and lengths can be compared.
	        texts = list(texts)
	        labels = list(labels)
	        if not labels:
	            raise ValueError("Cannot train on an empty dataset.")
	        if len(texts) != len(labels):
	            raise ValueError(
	                "texts and labels must have the same length "
	                f"(got {len(texts)} and {len(labels)})."
	            )
	        if self.alpha <= 0:
	            raise ValueError("alpha must be positive for additive smoothing.")

	        # Counts: tokenize every document exactly once.
	        wc_spam = Counter()
	        wc_ham = Counter()
	        spam_docs = 0
	        for text, label in zip(texts, labels):
	            tokens = self.tokenize(text)
	            if label == 1:
	                spam_docs += 1
	                wc_spam.update(tokens)
	            else:
	                wc_ham.update(tokens)

	        total_docs = len(labels)
	        ham_docs = total_docs - spam_docs

	        # The vocabulary is exactly the set of tokens counted in either class.
	        # Built fresh here so retraining never sees a stale (or list) vocab.
	        vocab = sorted(wc_spam.keys() | wc_ham.keys())
	        if not vocab:
	            raise ValueError("Training texts contain no tokens.")
	        self.vocab = vocab

	        # Class priors.
	        self.P_spam = spam_docs / total_docs
	        self.P_ham = ham_docs / total_docs

	        # Smoothed denominators: class token count + alpha per vocab entry.
	        V = len(vocab)
	        total_spam = sum(wc_spam.values()) + self.alpha * V
	        total_ham = sum(wc_ham.values()) + self.alpha * V

	        self.log_spam = {
	            w: math.log((wc_spam[w] + self.alpha) / total_spam) for w in vocab
	        }
	        self.log_ham = {
	            w: math.log((wc_ham[w] + self.alpha) / total_ham) for w in vocab
	        }

	        # Tokens never seen in training get the zero-count smoothed value.
	        self.unk_spam = math.log(self.alpha / total_spam)
	        self.unk_ham = math.log(self.alpha / total_ham)
	        print("Training Complete.")

	    def predict(self, text):
	        """Classify *text*.

	        Returns:
	            ``1`` if the spam log-score is strictly greater than the ham
	            log-score, otherwise ``0`` (ties resolve to ham).
	        """
	        toks = self.tokenize(text)
	        # The small epsilon keeps log() defined when a prior is exactly 0.
	        s_spam = math.log(self.P_spam + 1e-12)
	        s_ham = math.log(self.P_ham + 1e-12)

	        for t in toks:
	            s_spam += self.log_spam.get(t, self.unk_spam)
	            s_ham += self.log_ham.get(t, self.unk_ham)

	        return 1 if s_spam > s_ham else 0

	if __name__ == "__main__":
	    # Script entry point: download the dataset, train a model and save it
	    # with dill. `datasets` is only imported when run as a script.
	    from datasets import load_dataset

	    print("Loading data...")
	    train_rows = load_dataset("mshenoda/spam-messages")['train']

	    # Collect message texts and normalize labels to 0/1 integers.
	    # String labels map to 1 for 'spam' or '1' (case-insensitive), else 0;
	    # non-string labels are converted with int().
	    texts = []
	    labels = []
	    for row in train_rows:
	        texts.append(row['text'])
	        raw_label = row['label']
	        if not isinstance(raw_label, str):
	            labels.append(int(raw_label))
	        elif raw_label.lower() in ['spam', '1']:
	            labels.append(1)
	        else:
	            labels.append(0)

	    print("Training clean model...")
	    classifier = SpamNaiveBayes()
	    classifier.train(texts, labels)

	    # Serialize the trained model to disk.
	    with open("model_nb_clean.pkl", "wb") as out_file:
	        dill.dump(classifier, out_file)
	    print("✅ Success! 'model_nb_clean.pkl' created. Upload this file to Hugging Face.")