# Jakob Neugebauer
# Initial release: TF-IDF + Linear SVM trained on jngb-labs/sms-spam
# 402d22c
"""
Train the classical SMS spam classifier (TF-IDF + Linear SVM).
Reproduces the best model from the A1 assessment: a FeatureUnion of word
TF-IDF (1+2 grams), character TF-IDF (3-5 char_wb grams), and
hand-crafted surface features, fed to a calibrated Linear SVM.
Why CalibratedClassifierCV: LinearSVC by itself produces a decision
function, not probabilities. The Space needs a probability score to
display, so we wrap LinearSVC in CalibratedClassifierCV(method='sigmoid').
This applies Platt scaling and adds predict_proba without changing the
underlying classifier's decisions.
Cross-validation: stratified 5-fold to estimate generalisation, then
final fit on all the data for the deployed artifact.
Usage:
python scripts/train.py \\
--data ../sms-spam/data.csv \\
--out model/classifier.joblib \\
--report model/cv_report.json
"""
from __future__ import annotations
import argparse
import json
import sys
from pathlib import Path
import joblib
import numpy as np
import pandas as pd
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (
accuracy_score,
classification_report,
confusion_matrix,
f1_score,
precision_score,
recall_score,
)
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
# Directory one level above the folder containing this script
# (parents[1] of the resolved file path).
ROOT = Path(__file__).resolve().parents[1]
# Put ROOT/src at the front of the import path before importing `features`
# (presumably where features.py lives -- confirm against the repo layout).
sys.path.insert(0, str(ROOT / "src"))
from features import build_feature_pipeline # noqa: E402
# Class names in encoding order: a label's position in this list is the
# integer it is mapped to when the label column is encoded.
LABELS = ["ham", "spam"]
# Integer id of "spam" in LABELS; passed as pos_label to the F1, precision
# and recall metrics.
SPAM_INDEX = 1
def build_model() -> Pipeline:
    """Assemble the end-to-end estimator: feature extraction, then classifier.

    The classifier is a class-balanced ``LinearSVC`` wrapped in
    ``CalibratedClassifierCV`` (sigmoid calibration, 3 internal folds).

    Returns:
        An unfitted ``Pipeline`` with the steps ``"features"`` and ``"clf"``.
    """
    feature_step = ("features", build_feature_pipeline())

    # Fixed hyper-parameters and random_state keep repeated runs comparable.
    svm = LinearSVC(
        C=1.0,
        class_weight="balanced",
        dual="auto",
        max_iter=5000,
        random_state=42,
    )
    calibrated_svm = CalibratedClassifierCV(
        estimator=svm,
        method="sigmoid",
        cv=3,
    )
    classifier_step = ("clf", calibrated_svm)

    return Pipeline([feature_step, classifier_step])
def cv_evaluate(X, y, n_splits: int = 5, seed: int = 42) -> dict:
    """Score the model with shuffled, stratified k-fold cross-validation.

    A fresh model from ``build_model()`` is fitted on each training split and
    evaluated on the matching held-out split. One progress line is printed
    per fold.

    Args:
        X: Collection of inputs, indexed by integer position.
        y: Label array aligned with ``X``; it is indexed with the index
            arrays produced by the splitter.
        n_splits: Number of folds.
        seed: Random state used when shuffling rows into folds.

    Returns:
        A dict holding ``n_splits``, the list of ``per_fold`` metric dicts,
        and a ``{"mean", "std"}`` summary across folds for each of
        ``accuracy``, ``spam_f1``, ``spam_precision`` and ``spam_recall``.
    """
    metric_names = ("accuracy", "spam_f1", "spam_precision", "spam_recall")
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    per_fold = []
    for fold_no, (fit_rows, held_rows) in enumerate(splitter.split(X, y), start=1):
        estimator = build_model()

        texts_fit = [X[row] for row in fit_rows]
        texts_held = [X[row] for row in held_rows]
        truth = y[held_rows]

        estimator.fit(texts_fit, y[fit_rows])
        guess = estimator.predict(texts_held)

        scores = {
            "fold": fold_no,
            "accuracy": float(accuracy_score(truth, guess)),
            "spam_f1": float(f1_score(truth, guess, pos_label=SPAM_INDEX)),
            "spam_precision": float(
                precision_score(truth, guess, pos_label=SPAM_INDEX)
            ),
            "spam_recall": float(recall_score(truth, guess, pos_label=SPAM_INDEX)),
        }
        per_fold.append(scores)
        print(f" fold {fold_no}: spam_f1={scores['spam_f1']:.4f}")

    # Summaries are appended in metric_names order after the two header keys.
    summary = {"n_splits": n_splits, "per_fold": per_fold}
    for name in metric_names:
        column = np.array([entry[name] for entry in per_fold])
        summary[name] = {"mean": float(column.mean()), "std": float(column.std())}
    return summary
def main() -> int:
    """Command-line entry point: cross-validate, fit on all data, save artifacts.

    Reads a CSV with ``label`` and ``text`` columns, runs stratified k-fold
    cross-validation, fits the final model on every row, then writes a joblib
    bundle (pipeline, labels, metadata) and a JSON report (CV metrics plus
    the train-fit confusion matrix).

    Returns:
        0 on success, for use as the process exit status.

    Raises:
        ValueError: If the CSV lacks a required column, or if the label
            column contains a value that is not listed in ``LABELS``.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--data", required=True, help="Path to data.csv (label,text)")
    parser.add_argument("--out", required=True, help="Where to save the joblib model")
    parser.add_argument("--report", required=True, help="Where to save the CV report (JSON)")
    parser.add_argument("--seed", type=int, default=42)
    args = parser.parse_args()

    data_path = Path(args.data)
    out_path = Path(args.out)
    report_path = Path(args.report)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    report_path.parent.mkdir(parents=True, exist_ok=True)

    print(f"Loading data from: {data_path}")
    df = pd.read_csv(data_path)

    # Explicit raise instead of `assert`: assertions are removed under
    # `python -O`, which would let a malformed CSV through.
    missing_columns = {"label", "text"} - set(df.columns)
    if missing_columns:
        raise ValueError(
            f"Expected columns: label, text (missing: {sorted(missing_columns)})"
        )

    spam_count = int((df["label"] == "spam").sum())
    ham_count = int((df["label"] == "ham").sum())
    print(f" rows: {len(df)} | spam: {spam_count} "
          f"| ham: {ham_count}")

    # Encode labels as integers using the global LABELS order
    label_to_int = {label: i for i, label in enumerate(LABELS)}
    encoded = df["label"].map(label_to_int)
    # Series.map yields NaN for values absent from the mapping; fail here with
    # a clear message rather than handing NaN targets to scikit-learn.
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(df.loc[unmapped, "label"].astype(str).unique())
        raise ValueError(
            f"Unexpected values in 'label' column (expected one of {LABELS}): {unknown}"
        )
    y = encoded.astype(int).to_numpy()
    X = df["text"].fillna("").astype(str).tolist()

    # Single source of truth for the fold count used in the message and call.
    n_splits = 5
    print(f"\nRunning {n_splits}-fold stratified CV ...")
    cv = cv_evaluate(X, y, n_splits=n_splits, seed=args.seed)

    print("\nCV summary:")
    print(f" accuracy: {cv['accuracy']['mean']:.4f} (+/- {cv['accuracy']['std']:.4f})")
    print(f" spam F1: {cv['spam_f1']['mean']:.4f} (+/- {cv['spam_f1']['std']:.4f})")
    print(f" spam precision: {cv['spam_precision']['mean']:.4f} (+/- {cv['spam_precision']['std']:.4f})")
    print(f" spam recall: {cv['spam_recall']['mean']:.4f} (+/- {cv['spam_recall']['std']:.4f})")

    print("\nFitting final model on full dataset ...")
    final_model = build_model()
    final_model.fit(X, y)

    # Final-fit confusion matrix on the same data (training fit, not held-out)
    train_pred = final_model.predict(X)
    train_cm = confusion_matrix(y, train_pred, labels=[0, 1])
    print(f"\nTrain-fit confusion matrix [[hh, hs],[sh, ss]]: {train_cm.tolist()}")
    print("\n" + classification_report(y, train_pred, target_names=LABELS, digits=4))

    print(f"\nSaving model -> {out_path}")
    bundle = {
        "pipeline": final_model,
        "labels": LABELS,
        "metadata": {
            "base_model": "LinearSVC (calibrated)",
            "feature_pipeline": "word_tfidf(1,2) + char_tfidf(3,5) + surface",
            "training_rows": len(df),
            "spam_count": spam_count,
            "ham_count": ham_count,
            "seed": args.seed,
        },
    }
    joblib.dump(bundle, out_path, compress=3)

    print(f"Saving CV report -> {report_path}")
    report = {
        "cv": cv,
        "train_fit_confusion_matrix": {
            "labels": LABELS,
            "matrix": train_cm.tolist(),
        },
    }
    report_path.write_text(json.dumps(report, indent=2), encoding="utf-8")
    return 0
if __name__ == "__main__":
    # Hand main()'s integer return value to the interpreter as the exit status.
    sys.exit(main())