Upload 15 files
- app.py +113 -0
- models/DistilBert/config.json +32 -0
- models/DistilBert/model.safetensors +3 -0
- models/DistilBert/special_tokens_map.json +7 -0
- models/DistilBert/tokenizer.json +0 -0
- models/DistilBert/tokenizer_config.json +56 -0
- models/DistilBert/training_args.bin +3 -0
- models/DistilBert/vocab.txt +0 -0
- models/__pycache__/spam_model.cpython-311.pyc +0 -0
- models/model_bilstm.pt +3 -0
- models/model_cnn.pt +3 -0
- models/model_nb.pkl +3 -0
- models/spam_model.py +87 -0
- models/vocab.json +0 -0
- requirements.txt +4 -3
app.py
ADDED
@@ -0,0 +1,113 @@
import streamlit as st
import torch
import dill
import json
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import time
from models.spam_model import SpamNaiveBayes


# use cpu
device = torch.device("cpu")

# for tokenizing text
def tokenize(text):
    return re.findall(r"\w+|[!?.]", str(text).lower())

# for encoding text
def encode_text(text, vocab, max_len=40):
    toks = tokenize(text)
    ids = [vocab.get(t, 1) for t in toks[:max_len]]
    return ids + [0]*(max_len - len(ids))

# function for loading all 4 models
@st.cache_resource  # cache so it doesn't reload every time
def load_models():
    # load vocab for CNN and BiLSTM
    with open('./models/vocab.json', 'r') as f:
        vocab = json.load(f)

    # load Naive Bayes
    with open('./models/model_nb.pkl', 'rb') as f:
        nb_model = dill.load(f)

    # load CNN and BiLSTM
    cnn_model = torch.jit.load("./models/model_cnn.pt", map_location=device)
    lstm_model = torch.jit.load("./models/model_bilstm.pt", map_location=device)

    # load DistilBERT
    bert_tokenizer = AutoTokenizer.from_pretrained("./models/DistilBert")
    bert_model = AutoModelForSequenceClassification.from_pretrained("./models/DistilBert")

    return vocab, nb_model, cnn_model, lstm_model, bert_tokenizer, bert_model

# load everything
try:
    vocab, nb_model, cnn, lstm, bert_tok, bert = load_models()
    st.toast("System Ready!", icon="✅")
except Exception as e:
    st.error(f"Failed to load models. Error: {e}")
    st.stop()

## Streamlit Logic
st.title("Spam Message Classifier")
st.markdown("Compare 4 different AI architectures on the same message.")

# textbox
text = st.text_area("Enter Message:", "Congratulations! You've won a $1000 Walmart gift card. Click here to claim.")

# sidebar
with st.sidebar:
    st.header("About Project")
    st.write("The goal of this project is to compare Traditional Machine Learning vs. Deep Learning models for text classification.")
    st.divider()
    st.link_button("Dataset", "https://huggingface.co/datasets/mshenoda/spam-messages")

if st.button("Analyze Message", type="primary"):
    col1, col2 = st.columns(2)
    col3, col4 = st.columns(2)

    # Naive Bayes
    start = time.time()
    nb_res = nb_model.predict(text)
    end = time.time()
    lbl = "SPAM" if nb_res == 1 else "HAM"
    col1.metric("Naive Bayes", lbl, f"{(end-start)*1000:.1f} ms")

    # Prepare for CNN and LSTM
    input_ids = torch.tensor([encode_text(text, vocab)]).to(device)

    # CNN
    start = time.time()
    with torch.no_grad():
        cnn_res = cnn(input_ids).argmax(1).item()
    end = time.time()
    lbl = "SPAM" if cnn_res == 1 else "HAM"
    col2.metric("CNN", lbl, f"{(end-start)*1000:.1f} ms")

    # BiLSTM
    start = time.time()
    with torch.no_grad():
        lstm_res = lstm(input_ids).argmax(1).item()
    end = time.time()
    lbl = "SPAM" if lstm_res == 1 else "HAM"
    col3.metric("BiLSTM", lbl, f"{(end-start)*1000:.1f} ms")

    # DistilBERT
    start = time.time()
    inputs = bert_tok(text, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        logits = bert(**inputs).logits
        bert_res = logits.argmax().item()
    end = time.time()
    lbl = "SPAM" if bert_res == 1 else "HAM"
    col4.metric("DistilBERT", lbl, f"{(end-start)*1000:.1f} ms")

with st.expander("View Model Details"):
    st.markdown("""
    * **Naive Bayes:** A traditional **Machine Learning** model that uses probability statistics (Bayes' Theorem) to predict spam based on simple word counts.
    * **CNN:** A **Deep Learning** model that uses sliding "filters" to detect specific patterns of words (like "free prize"), similar to how it detects edges in images.
    * **BiLSTM:** A **Recurrent Neural Network (RNN)** that reads the message forwards and backwards simultaneously to understand the context and sequence of words.
    * **DistilBERT:** A **Transformer** model that uses "Self-Attention" to understand the complex meaning and relationship between every word in the sentence.
    """)
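A note on `encode_text` above: unknown tokens map to id 1 and id 0 pads every message to `max_len`, so the CNN and BiLSTM always receive a fixed-length tensor. A tiny sketch with a toy vocabulary (illustrative only; the real mapping ships in models/vocab.json below):

# Toy vocabulary for illustration; the app loads models/vocab.json instead.
toy_vocab = {"win": 2, "free": 3, "prize": 4, "!": 5}

print(encode_text("Win a FREE prize!", toy_vocab, max_len=8))
# [2, 1, 3, 4, 5, 0, 0, 0]  -- "a" is out-of-vocabulary -> 1; trailing 0s are padding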
models/DistilBert/config.json
ADDED
@@ -0,0 +1,32 @@
{
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "HAM",
    "1": "SPAM"
  },
  "initializer_range": 0.02,
  "label2id": {
    "HAM": 0,
    "SPAM": 1
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3",
  "vocab_size": 30522
}
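Note the `id2label` block: `AutoModelForSequenceClassification.from_pretrained` reads it into the model config, so the hard-coded `"SPAM" if res == 1 else "HAM"` mapping in app.py could equally be looked up from the model itself. A minimal sketch, assuming the checkpoint added in this commit sits at ./models/DistilBert:

# Sketch: recover the label name from the config shipped in this commit.
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("./models/DistilBert")
pred = 1  # e.g. logits.argmax().item()
print(model.config.id2label[pred])  # -> "SPAM"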
models/DistilBert/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:befff6c44b2d61855b90616e026f2b66d99210ae762b3bb52055b8bcfb047fba
size 267832560
models/DistilBert/special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@
{
  "cls_token": "[CLS]",
  "mask_token": "[MASK]",
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "unk_token": "[UNK]"
}
models/DistilBert/tokenizer.json
ADDED
The diff for this file is too large to render.
models/DistilBert/tokenizer_config.json
ADDED
@@ -0,0 +1,56 @@
{
  "added_tokens_decoder": {
    "0": {
      "content": "[PAD]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "100": {
      "content": "[UNK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "101": {
      "content": "[CLS]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "102": {
      "content": "[SEP]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "103": {
      "content": "[MASK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "clean_up_tokenization_spaces": false,
  "cls_token": "[CLS]",
  "do_lower_case": true,
  "extra_special_tokens": {},
  "mask_token": "[MASK]",
  "model_max_length": 512,
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "strip_accents": null,
  "tokenize_chinese_chars": true,
  "tokenizer_class": "DistilBertTokenizer",
  "unk_token": "[UNK]"
}
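This is the configuration that `AutoTokenizer.from_pretrained("./models/DistilBert")` in app.py reconstructs into a `DistilBertTokenizer`. A minimal sketch of the special tokens in action (the output comment shows the expected shape, not a captured run):

# Sketch: the saved config yields a lower-casing WordPiece tokenizer whose
# encodings are wrapped in [CLS] ... [SEP], per added_tokens_decoder above.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./models/DistilBert")
enc = tok("Free prize!", return_tensors="pt")
print(tok.convert_ids_to_tokens(enc["input_ids"][0]))
# expected: ['[CLS]', 'free', 'prize', '!', '[SEP]']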
models/DistilBert/training_args.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:de5de394762643c9f549cfbd77db0ee77f88bc3c4b8567af754d79287de02681
size 5713
models/DistilBert/vocab.txt
ADDED
The diff for this file is too large to render.
models/__pycache__/spam_model.cpython-311.pyc
ADDED
Binary file (6.18 kB).
models/model_bilstm.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b72e135a2ad091eda1c9502b5f9e8bdcdf5e57b4da3bd67fb89a0ceafe0c1596
size 66132208
models/model_cnn.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:eaaf0fd1e07cb80ad08ebba4040533a979c2c4689c680bc4e174aeb2e4a1a2e2
size 65668371
models/model_nb.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3bfcf52d24f845ec808ddfe0c2cdbcef512508ed3990ad6c900024eaaa8603cd
size 6348039
models/spam_model.py
ADDED
@@ -0,0 +1,87 @@
import math
import re
import dill
from collections import Counter

class SpamNaiveBayes:
    def __init__(self, alpha=1):
        self.alpha = alpha
        self.vocab = set()
        self.log_spam = {}
        self.log_ham = {}
        self.P_spam = 0
        self.P_ham = 0
        self.unk_spam = 0
        self.unk_ham = 0

    def tokenize(self, text):
        return re.findall(r"\w+|[!?.]", str(text).lower())

    def train(self, texts, labels):
        # Build Vocab
        for t in texts:
            self.vocab.update(self.tokenize(t))
        self.vocab = sorted(self.vocab)

        # Counts
        wc_spam = Counter()
        wc_ham = Counter()
        spam_docs = sum(1 for l in labels if l == 1)
        ham_docs = len(labels) - spam_docs
        total_docs = len(labels)

        for txt, lab in zip(texts, labels):
            toks = self.tokenize(txt)
            if lab == 1:
                wc_spam.update(toks)
            else:
                wc_ham.update(toks)

        # Calculate Probabilities
        self.P_spam = spam_docs / total_docs
        self.P_ham = ham_docs / total_docs

        V = len(self.vocab)
        total_spam = sum(wc_spam.values()) + self.alpha * V
        total_ham = sum(wc_ham.values()) + self.alpha * V

        self.log_spam = {w: math.log((wc_spam[w] + self.alpha) / total_spam) for w in self.vocab}
        self.log_ham = {w: math.log((wc_ham[w] + self.alpha) / total_ham) for w in self.vocab}

        self.unk_spam = math.log(self.alpha / total_spam)
        self.unk_ham = math.log(self.alpha / total_ham)
        print("Training Complete.")

    def predict(self, text):
        toks = self.tokenize(text)
        s_spam = math.log(self.P_spam + 1e-12)
        s_ham = math.log(self.P_ham + 1e-12)

        for t in toks:
            s_spam += self.log_spam.get(t, self.unk_spam)
            s_ham += self.log_ham.get(t, self.unk_ham)

        return 1 if s_spam > s_ham else 0

if __name__ == "__main__":
    from datasets import load_dataset

    print("Loading data...")
    ds = load_dataset("mshenoda/spam-messages")
    texts = [x['text'] for x in ds['train']]

    labels = []
    for x in ds['train']:
        lab = x['label']
        if isinstance(lab, str):
            labels.append(1 if lab.lower() in ['spam', '1'] else 0)
        else:
            labels.append(int(lab))

    print("Training clean model...")
    model = SpamNaiveBayes()
    model.train(texts, labels)

    with open("model_nb_clean.pkl", "wb") as f:
        dill.dump(model, f)
    print("✅ Success! 'model_nb_clean.pkl' created. Upload this file to Hugging Face.")
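For reference, `train` and `predict` together implement multinomial Naive Bayes with Laplace smoothing. Writing $n_c(t)$ for the count of token $t$ in class $c$, $N_c$ for the total token count of class $c$, $|V|$ for the vocabulary size, and $\alpha$ for the smoothing constant (default 1), each class is scored as

$$s_c = \log P(c) + \sum_{t \in \text{tokens}} \log \frac{n_c(t) + \alpha}{N_c + \alpha\,|V|}, \qquad \hat{y} = \mathbb{1}\!\left[\, s_{\text{spam}} > s_{\text{ham}} \,\right],$$

where tokens unseen during training fall back to $\log\frac{\alpha}{N_c + \alpha|V|}$ (the `unk_spam` / `unk_ham` terms above).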
models/vocab.json
ADDED
The diff for this file is too large to render.
requirements.txt
CHANGED
@@ -1,3 +1,4 @@
-
-
-
+streamlit
+torch
+transformers
+dill
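With this dependency list in place, reproducing the Space locally should amount to `pip install -r requirements.txt` followed by `streamlit run app.py` from the repository root, assuming the Git LFS model files above have been pulled.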