import math
import re
import dill
from collections import Counter

class SpamNaiveBayes:
    """Multinomial Naive Bayes spam classifier with additive smoothing.

    Labels follow the convention used throughout this class: ``1`` means
    spam, every other value is treated as ham. ``predict`` returns ``1``
    (spam) or ``0`` (ham).
    """

    def __init__(self, alpha=1):
        """Create an untrained model.

        Args:
            alpha: Additive (Laplace) smoothing pseudo-count added to every
                token count. Must be positive by the time ``train`` is called.
        """
        self.alpha = alpha
        # Empty set until train() runs; afterwards a sorted list of tokens.
        self.vocab = set()
        # token -> log P(token | class), filled in by train().
        self.log_spam = {}
        self.log_ham = {}
        # Class priors (fraction of training documents in each class).
        self.P_spam = 0
        self.P_ham = 0
        # Log-probability assigned to tokens never seen during training.
        self.unk_spam = 0
        self.unk_ham = 0

    def tokenize(self, text):
        """Split *text* into lowercase word tokens and ``!``/``?``/``.`` marks.

        The input is passed through ``str()`` first, so non-string values
        (numbers, ``None``) are tokenised by their string form.
        """
        return re.findall(r"\w+|[!?.]", str(text).lower())

    def train(self, texts, labels):
        """Fit the model from scratch on parallel *texts* and *labels*.

        Any state from an earlier ``train`` call is discarded, so the method
        can safely be called more than once on the same instance.

        Args:
            texts: Iterable of documents (each is tokenised via ``tokenize``).
            labels: Iterable of labels, one per document; ``1`` is spam and
                anything else is ham.

        Raises:
            ValueError: If *texts* and *labels* differ in length, if there is
                no training data, or if ``self.alpha`` is not positive.
        """
        texts = list(texts)
        labels = list(labels)

        if len(texts) != len(labels):
            # zip() would silently drop the surplus and skew the priors.
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        if not labels:
            raise ValueError("cannot train on an empty dataset")
        if self.alpha <= 0:
            # log(alpha / total) below is undefined for alpha <= 0.
            raise ValueError("alpha must be positive")

        # Counts: tokenise every document exactly once.
        wc_spam = Counter()
        wc_ham = Counter()
        spam_docs = 0
        for txt, lab in zip(texts, labels):
            toks = self.tokenize(txt)
            if lab == 1:
                wc_spam.update(toks)
                spam_docs += 1
            else:
                wc_ham.update(toks)

        total_docs = len(labels)
        ham_docs = total_docs - spam_docs

        # Build the vocabulary fresh from this call's data only. It is stored
        # as a sorted list so the serialised model is deterministic.
        vocab = sorted(wc_spam.keys() | wc_ham.keys())

        # Smoothed denominators: observed tokens plus alpha per vocab entry.
        V = len(vocab)
        total_spam = sum(wc_spam.values()) + self.alpha * V
        total_ham = sum(wc_ham.values()) + self.alpha * V

        log_spam = {w: math.log((wc_spam[w] + self.alpha) / total_spam) for w in vocab}
        log_ham = {w: math.log((wc_ham[w] + self.alpha) / total_ham) for w in vocab}
        unk_spam = math.log(self.alpha / total_spam)
        unk_ham = math.log(self.alpha / total_ham)

        # Commit everything at the end so a failure above cannot leave the
        # model half-updated.
        self.vocab = vocab
        self.P_spam = spam_docs / total_docs
        self.P_ham = ham_docs / total_docs
        self.log_spam = log_spam
        self.log_ham = log_ham
        self.unk_spam = unk_spam
        self.unk_ham = unk_ham
        print("Training Complete.")

    def predict(self, text):
        """Classify *text*; return ``1`` for spam and ``0`` for ham.

        Scores are log-prior plus the sum of per-token log-likelihoods.
        Tokens absent from the trained tables use the ``unk_*`` fallback.
        A tie is resolved as ham.
        """
        toks = self.tokenize(text)
        # The epsilon keeps log() defined when a prior is exactly 0, e.g. on
        # an untrained model or when a class was absent from training data.
        s_spam = math.log(self.P_spam + 1e-12)
        s_ham = math.log(self.P_ham + 1e-12)

        for t in toks:
            s_spam += self.log_spam.get(t, self.unk_spam)
            s_ham += self.log_ham.get(t, self.unk_ham)

        return 1 if s_spam > s_ham else 0

if __name__ == "__main__":
    # Imported here so that merely importing this module does not require
    # the `datasets` package.
    from datasets import load_dataset

    print("Loading data...")
    train_rows = load_dataset("mshenoda/spam-messages")['train']

    # Collect message bodies and binary labels in a single pass.
    # String labels map to 1 for "spam"/"1" (case-insensitive) and 0
    # otherwise; non-string labels are coerced with int().
    texts, labels = [], []
    for row in train_rows:
        texts.append(row['text'])
        raw_label = row['label']
        if not isinstance(raw_label, str):
            labels.append(int(raw_label))
            continue
        labels.append(int(raw_label.lower() in ['spam', '1']))

    print("Training clean model...")
    model = SpamNaiveBayes()
    model.train(texts, labels)

    # Serialise the trained model with dill for later upload.
    with open("model_nb_clean.pkl", "wb") as out_file:
        dill.dump(model, out_file)
    print("✅ Success! 'model_nb_clean.pkl' created. Upload this file to Hugging Face.")