Text Classification
Scikit-learn
Joblib
English
sms
spam-detection
tf-idf
linear-svm
scikit-learn
Eval Results (legacy)
Instructions to use jngb-labs/sms-spam-classical with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Scikit-learn
How to use jngb-labs/sms-spam-classical with Scikit-learn:
from huggingface_hub import hf_hub_download
import joblib

model = joblib.load(
    hf_hub_download("jngb-labs/sms-spam-classical", "sklearn_model.joblib")
)
# only load pickle files from sources you trust
# read more about it here https://skops.readthedocs.io/en/stable/persistence.html
- Notebooks
- Google Colab
- Kaggle
| """ | |
| Feature pipeline for the SMS spam classifier. | |
| The pipeline is a FeatureUnion over three blocks: | |
| 1. Word-level TF-IDF (1+2 grams) — captures vocabulary patterns | |
| ("free", "prize guaranteed"). | |
| 2. Character-level TF-IDF (3-5 grams, char_wb) — captures sub-word | |
| patterns and spelling variants ("Fr3e" shares character pieces with | |
| "free"). Same idea FastText popularised, sklearn-compatible. | |
| 3. Hand-crafted surface features — length, digit ratio, uppercase ratio, | |
| punctuation counts, has-URL / has-phone / has-currency booleans. | |
| No explicit stop_words list. max_df=0.95 plus IDF weighting handles | |
| common-word suppression in a more principled way than sklearn's default | |
| English stop list, which would remove domain-meaningful words like | |
| "call". | |
| """ | |
| from __future__ import annotations | |
| import re | |
| import numpy as np | |
| from scipy import sparse | |
| from sklearn.base import BaseEstimator, TransformerMixin | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.pipeline import FeatureUnion | |
# Module-level patterns backing the has_url / has_phone / has_currency flags.
# Compiled once at import time so the per-message loop only calls .search().

# http(s):// links, www.-prefixed hosts, or a bare ".co.uk" / ".com" suffix.
URL_RE = re.compile(
    r"\bhttps?://\S+|\bwww\.\S+|\.co\.uk\b|\.com\b",
    re.I,
)

# Any standalone run of four or more digits (word-bounded on both sides).
PHONE_RE = re.compile(
    r"\b\d{4,}\b",
)

# A currency symbol anywhere, or a standalone GBP / USD / EUR code in any case.
CURRENCY_RE = re.compile(
    r"[£$€]|\b(?:GBP|USD|EUR)\b",
    re.I,
)
class SurfaceFeatures(BaseEstimator, TransformerMixin):
    """Hand-crafted features that encode 'spam looks visibly different'.

    All features are normalised to roughly [0, 1] at extraction time, so
    no separate scaler is needed in the pipeline. This keeps the pickled
    model robust across sklearn versions (sklearn pickles are notoriously
    fragile across MaxAbsScaler / StandardScaler internals between releases).

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    depends only on the messages it is given.
    """

    # Column order of the matrix returned by ``transform``.
    feature_names = [
        "length_norm",  # length / 200 (most SMS are under 200 chars), clipped to 1
        "n_words_norm",  # n_words / 50 (most SMS are under 50 words), clipped to 1
        "digit_ratio",  # already 0..1
        "upper_ratio",  # already 0..1
        "punct_ratio",  # already 0..1
        "has_url",
        "has_phone",
        "has_currency",
        "n_exclamation_norm",  # exclamations / 10, clipped to 1
    ]

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there are no parameters to learn."""
        return self

    def transform(self, X):
        """Compute one row of surface features per message.

        Parameters
        ----------
        X : iterable of str
            Raw messages. Falsy entries (``""`` or ``None``) produce an
            all-zero row.

        Returns
        -------
        scipy.sparse.csr_matrix
            Matrix of shape ``(n_messages, len(feature_names))`` and dtype
            float64, with columns in ``feature_names`` order.
        """
        n_features = len(self.feature_names)
        rows = []
        for msg in X:
            if not msg:
                rows.append([0.0] * n_features)
                continue
            # msg is non-empty from here on, so length >= 1 and the ratio
            # divisions below cannot divide by zero.
            length = len(msg)
            n_words = len(msg.split())
            digits = sum(1 for c in msg if c.isdigit())
            uppers = sum(1 for c in msg if c.isupper())
            puncts = sum(1 for c in msg if c in ".,!?;:")
            exclamation = msg.count("!")
            rows.append([
                min(length / 200.0, 1.0),
                min(n_words / 50.0, 1.0),
                digits / length,
                uppers / length,
                puncts / length,
                1.0 if URL_RE.search(msg) else 0.0,
                1.0 if PHONE_RE.search(msg) else 0.0,
                1.0 if CURRENCY_RE.search(msg) else 0.0,
                min(exclamation / 10.0, 1.0),
            ])
        # Force a 2-D shape explicitly: with no messages, np.asarray([]) is
        # 1-D and csr_matrix would promote it to a single row of width 0,
        # which no longer lines up with the other feature blocks' row count.
        dense = np.asarray(rows, dtype=np.float64).reshape(len(rows), n_features)
        return sparse.csr_matrix(dense)
def build_feature_pipeline() -> FeatureUnion:
    """Assemble the FeatureUnion of word TF-IDF, character TF-IDF and surface features.

    Returns an unfitted ``FeatureUnion`` with three named transformers:
    ``word_tfidf`` (lowercased word 1-2 grams, accents stripped),
    ``char_tfidf`` (``char_wb`` 3-5 grams) and ``surface``
    (a ``SurfaceFeatures`` instance).
    """
    # Settings common to both TF-IDF vectorizers: drop terms seen in fewer
    # than 2 documents or in more than 95% of them, and use sublinear tf.
    tfidf_common = {"min_df": 2, "max_df": 0.95, "sublinear_tf": True}

    transformers = [
        (
            "word_tfidf",
            TfidfVectorizer(
                lowercase=True,
                ngram_range=(1, 2),
                strip_accents="unicode",
                **tfidf_common,
            ),
        ),
        (
            "char_tfidf",
            TfidfVectorizer(
                analyzer="char_wb",
                ngram_range=(3, 5),
                **tfidf_common,
            ),
        ),
        ("surface", SurfaceFeatures()),
    ]
    return FeatureUnion(transformer_list=transformers, n_jobs=None)