Text Classification
Scikit-learn
Joblib
English
sms
spam-detection
tf-idf
linear-svm
scikit-learn
Eval Results (legacy)
Instructions to use jngb-labs/sms-spam-classical with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Scikit-learn
How to use jngb-labs/sms-spam-classical with Scikit-learn:
from huggingface_hub import hf_hub_download
import joblib

model = joblib.load(
    hf_hub_download("jngb-labs/sms-spam-classical", "sklearn_model.joblib")
)
# only load pickle files from sources you trust
# read more about it here https://skops.readthedocs.io/en/stable/persistence.html
- Notebooks
- Google Colab
- Kaggle
| """ | |
| Train the classical SMS spam classifier (TF-IDF + Linear SVM). | |
| Reproduces the best model from the A1 assessment: a FeatureUnion of word | |
| TF-IDF (1+2 grams), character TF-IDF (3-5 char_wb grams), and | |
| hand-crafted surface features, fed to a calibrated Linear SVM. | |
| Why CalibratedClassifierCV: LinearSVC by itself produces a decision | |
| function, not probabilities. The Space needs a probability score to | |
| display, so we wrap LinearSVC in CalibratedClassifierCV(method='sigmoid'). | |
| This applies Platt scaling and adds predict_proba without changing the | |
| underlying classifier's decisions. | |
| Cross-validation: stratified 5-fold to estimate generalisation, then | |
| final fit on all the data for the deployed artifact. | |
| Usage: | |
| python scripts/train.py \\ | |
| --data ../sms-spam/data.csv \\ | |
| --out model/classifier.joblib \\ | |
| --report model/cv_report.json | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import sys | |
| from pathlib import Path | |
| import joblib | |
| import numpy as np | |
| import pandas as pd | |
| from sklearn.calibration import CalibratedClassifierCV | |
| from sklearn.metrics import ( | |
| accuracy_score, | |
| classification_report, | |
| confusion_matrix, | |
| f1_score, | |
| precision_score, | |
| recall_score, | |
| ) | |
| from sklearn.model_selection import StratifiedKFold | |
| from sklearn.pipeline import Pipeline | |
| from sklearn.svm import LinearSVC | |
# Class names in encoding order: a label's position in this list is the
# integer id used for training and for the metric computations.
LABELS = ["ham", "spam"]
SPAM_INDEX = LABELS.index("spam")

# Directory one level above the folder containing this script
# (the usage example runs it as scripts/train.py).
ROOT = Path(__file__).resolve().parents[1]

# The feature code lives under <ROOT>/src; put it on sys.path before importing.
sys.path.insert(0, str(ROOT.joinpath("src")))
from features import build_feature_pipeline  # noqa: E402
def build_model() -> Pipeline:
    """Assemble the unfitted estimator: feature extraction, then a calibrated linear SVM.

    Returns:
        A two-step ``Pipeline`` whose ``"features"`` step comes from
        ``build_feature_pipeline()`` and whose ``"clf"`` step is a
        ``LinearSVC`` wrapped in ``CalibratedClassifierCV``.
    """
    feature_step = build_feature_pipeline()

    # LinearSVC only yields a decision function; the sigmoid (Platt)
    # calibration wrapper is what provides predict_proba.
    svm = LinearSVC(
        C=1.0,
        class_weight="balanced",
        dual="auto",
        max_iter=5000,
        random_state=42,
    )
    calibrated_svm = CalibratedClassifierCV(estimator=svm, method="sigmoid", cv=3)

    steps = [
        ("features", feature_step),
        ("clf", calibrated_svm),
    ]
    return Pipeline(steps)
def cv_evaluate(X, y, n_splits: int = 5, seed: int = 42) -> dict:
    """Score a freshly built model with shuffled, stratified k-fold CV.

    For every fold a new model from ``build_model()`` is fitted on the
    training split and evaluated on the validation split; one progress
    line per fold is printed.

    Args:
        X: Sequence of samples, indexed one position at a time.
        y: Label array that supports indexing with an array of positions
            (e.g. a NumPy array). The spam class is ``SPAM_INDEX``.
        n_splits: Number of folds.
        seed: ``random_state`` passed to the fold splitter.

    Returns:
        A dict with ``"n_splits"``, the list of per-fold metric dicts under
        ``"per_fold"``, and a ``{"mean", "std"}`` dict for each of
        ``"accuracy"``, ``"spam_f1"``, ``"spam_precision"`` and
        ``"spam_recall"``.
    """
    metric_names = ("accuracy", "spam_f1", "spam_precision", "spam_recall")
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    per_fold = []
    fold_no = 0
    for train_rows, val_rows in splitter.split(X, y):
        fold_no += 1
        estimator = build_model()

        train_texts = [X[j] for j in train_rows]
        val_texts = [X[j] for j in val_rows]
        train_labels = y[train_rows]
        truth = y[val_rows]

        estimator.fit(train_texts, train_labels)
        predicted = estimator.predict(val_texts)

        scores = {
            "fold": fold_no,
            "accuracy": float(accuracy_score(truth, predicted)),
            "spam_f1": float(f1_score(truth, predicted, pos_label=SPAM_INDEX)),
            "spam_precision": float(precision_score(truth, predicted, pos_label=SPAM_INDEX)),
            "spam_recall": float(recall_score(truth, predicted, pos_label=SPAM_INDEX)),
        }
        per_fold.append(scores)
        print(f" fold {fold_no}: spam_f1={scores['spam_f1']:.4f}")

    # Collapse each metric across folds into its mean and standard deviation.
    summary = {"n_splits": n_splits, "per_fold": per_fold}
    for name in metric_names:
        values = np.array([entry[name] for entry in per_fold])
        summary[name] = {"mean": float(values.mean()), "std": float(values.std())}
    return summary
def main() -> int:
    """Cross-validate, fit the final model on all data, and save the artifacts.

    Command-line arguments:
        --data    Path to the CSV file; must contain ``label`` and ``text`` columns.
        --out     Destination of the joblib bundle (pipeline, labels, metadata).
        --report  Destination of the JSON report (CV metrics and the
                  train-fit confusion matrix).
        --seed    Seed for the CV splitter, also recorded in the bundle
                  metadata (default 42).

    Returns:
        0 on success, intended to be used as the process exit status.

    Raises:
        ValueError: If a required column is missing, or if the ``label``
            column holds a value (including a missing one) that is not
            listed in ``LABELS``.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--data", required=True, help="Path to data.csv (label,text)")
    parser.add_argument("--out", required=True, help="Where to save the joblib model")
    parser.add_argument("--report", required=True, help="Where to save the CV report (JSON)")
    parser.add_argument("--seed", type=int, default=42)
    args = parser.parse_args()

    data_path = Path(args.data)
    out_path = Path(args.out)
    report_path = Path(args.report)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    report_path.parent.mkdir(parents=True, exist_ok=True)

    print(f"Loading data from: {data_path}")
    df = pd.read_csv(data_path)

    # Explicit raise instead of assert: asserts are removed under `python -O`,
    # which would let a malformed file through to a confusing KeyError.
    missing_columns = {"label", "text"} - set(df.columns)
    if missing_columns:
        raise ValueError(
            f"Expected columns: label, text (missing: {sorted(missing_columns)})"
        )

    spam_count = int((df["label"] == "spam").sum())
    ham_count = int((df["label"] == "ham").sum())
    print(f" rows: {len(df)} | spam: {spam_count} "
          f"| ham: {ham_count}")

    # Encode labels as integers using the global LABELS order
    label_to_int = {label: i for i, label in enumerate(LABELS)}
    encoded = df["label"].map(label_to_int)
    # Series.map leaves NaN for anything not in the mapping; reject those rows
    # here rather than handing a float/NaN target to scikit-learn.
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(df.loc[unmapped, "label"].astype(str).unique())
        raise ValueError(
            f"Unexpected label values {unknown}; expected one of {LABELS}"
        )
    y = encoded.astype("int64").to_numpy()
    # Missing texts are treated as empty messages.
    X = df["text"].fillna("").astype(str).tolist()

    print("\nRunning 5-fold stratified CV ...")
    cv = cv_evaluate(X, y, n_splits=5, seed=args.seed)
    print("\nCV summary:")
    print(f" accuracy: {cv['accuracy']['mean']:.4f} (+/- {cv['accuracy']['std']:.4f})")
    print(f" spam F1: {cv['spam_f1']['mean']:.4f} (+/- {cv['spam_f1']['std']:.4f})")
    print(f" spam precision: {cv['spam_precision']['mean']:.4f} (+/- {cv['spam_precision']['std']:.4f})")
    print(f" spam recall: {cv['spam_recall']['mean']:.4f} (+/- {cv['spam_recall']['std']:.4f})")

    print("\nFitting final model on full dataset ...")
    final_model = build_model()
    final_model.fit(X, y)

    # Final-fit confusion matrix on the same data (training fit, not held-out)
    train_pred = final_model.predict(X)
    train_cm = confusion_matrix(y, train_pred, labels=[0, 1])
    print(f"\nTrain-fit confusion matrix [[hh, hs],[sh, ss]]: {train_cm.tolist()}")
    print("\n" + classification_report(y, train_pred, target_names=LABELS, digits=4))

    print(f"\nSaving model -> {out_path}")
    bundle = {
        "pipeline": final_model,
        "labels": LABELS,
        "metadata": {
            "base_model": "LinearSVC (calibrated)",
            "feature_pipeline": "word_tfidf(1,2) + char_tfidf(3,5) + surface",
            "training_rows": len(df),
            "spam_count": spam_count,
            "ham_count": ham_count,
            "seed": args.seed,
        },
    }
    joblib.dump(bundle, out_path, compress=3)

    print(f"Saving CV report -> {report_path}")
    report = {
        "cv": cv,
        "train_fit_confusion_matrix": {
            "labels": LABELS,
            "matrix": train_cm.tolist(),
        },
    }
    report_path.write_text(json.dumps(report, indent=2), encoding="utf-8")
    return 0
if __name__ == "__main__":
    # Hand main()'s integer return value to the interpreter as the exit status.
    sys.exit(main())