""" Train the classical SMS spam classifier (TF-IDF + Linear SVM). Reproduces the best model from the A1 assessment: a FeatureUnion of word TF-IDF (1+2 grams), character TF-IDF (3-5 char_wb grams), and hand-crafted surface features, fed to a calibrated Linear SVM. Why CalibratedClassifierCV: LinearSVC by itself produces a decision function, not probabilities. The Space needs a probability score to display, so we wrap LinearSVC in CalibratedClassifierCV(method='sigmoid'). This applies Platt scaling and adds predict_proba without changing the underlying classifier's decisions. Cross-validation: stratified 5-fold to estimate generalisation, then final fit on all the data for the deployed artifact. Usage: python scripts/train.py \\ --data ../sms-spam/data.csv \\ --out model/classifier.joblib \\ --report model/cv_report.json """ from __future__ import annotations import argparse import json import sys from pathlib import Path import joblib import numpy as np import pandas as pd from sklearn.calibration import CalibratedClassifierCV from sklearn.metrics import ( accuracy_score, classification_report, confusion_matrix, f1_score, precision_score, recall_score, ) from sklearn.model_selection import StratifiedKFold from sklearn.pipeline import Pipeline from sklearn.svm import LinearSVC ROOT = Path(__file__).resolve().parents[1] sys.path.insert(0, str(ROOT / "src")) from features import build_feature_pipeline # noqa: E402 LABELS = ["ham", "spam"] SPAM_INDEX = 1 def build_model() -> Pipeline: """Full sklearn pipeline: features then calibrated linear SVM.""" return Pipeline([ ("features", build_feature_pipeline()), ("clf", CalibratedClassifierCV( estimator=LinearSVC( C=1.0, class_weight="balanced", dual="auto", max_iter=5000, random_state=42, ), method="sigmoid", cv=3, )), ]) def cv_evaluate(X, y, n_splits: int = 5, seed: int = 42) -> dict: """Stratified k-fold CV. Returns mean/std of standard metrics.""" skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed) fold_metrics = [] for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), start=1): model = build_model() X_train = [X[i] for i in train_idx] X_val = [X[i] for i in val_idx] y_train = y[train_idx] y_val = y[val_idx] model.fit(X_train, y_train) y_pred = model.predict(X_val) fold_metrics.append({ "fold": fold, "accuracy": float(accuracy_score(y_val, y_pred)), "spam_f1": float(f1_score(y_val, y_pred, pos_label=SPAM_INDEX)), "spam_precision": float(precision_score(y_val, y_pred, pos_label=SPAM_INDEX)), "spam_recall": float(recall_score(y_val, y_pred, pos_label=SPAM_INDEX)), }) print(f" fold {fold}: spam_f1={fold_metrics[-1]['spam_f1']:.4f}") def agg(key): vals = np.array([m[key] for m in fold_metrics]) return {"mean": float(vals.mean()), "std": float(vals.std())} return { "n_splits": n_splits, "per_fold": fold_metrics, "accuracy": agg("accuracy"), "spam_f1": agg("spam_f1"), "spam_precision": agg("spam_precision"), "spam_recall": agg("spam_recall"), } def main() -> int: parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument("--data", required=True, help="Path to data.csv (label,text)") parser.add_argument("--out", required=True, help="Where to save the joblib model") parser.add_argument("--report", required=True, help="Where to save the CV report (JSON)") parser.add_argument("--seed", type=int, default=42) args = parser.parse_args() data_path = Path(args.data) out_path = Path(args.out) report_path = Path(args.report) out_path.parent.mkdir(parents=True, exist_ok=True) report_path.parent.mkdir(parents=True, exist_ok=True) print(f"Loading data from: {data_path}") df = pd.read_csv(data_path) assert {"label", "text"} <= set(df.columns), "Expected columns: label, text" print(f" rows: {len(df)} | spam: {(df['label'] == 'spam').sum()} " f"| ham: {(df['label'] == 'ham').sum()}") # Encode labels as integers using the global LABELS order label_to_int = {label: i for i, label in enumerate(LABELS)} y = df["label"].map(label_to_int).to_numpy() X = df["text"].fillna("").astype(str).tolist() print("\nRunning 5-fold stratified CV ...") cv = cv_evaluate(X, y, n_splits=5, seed=args.seed) print(f"\nCV summary:") print(f" accuracy: {cv['accuracy']['mean']:.4f} (+/- {cv['accuracy']['std']:.4f})") print(f" spam F1: {cv['spam_f1']['mean']:.4f} (+/- {cv['spam_f1']['std']:.4f})") print(f" spam precision: {cv['spam_precision']['mean']:.4f} (+/- {cv['spam_precision']['std']:.4f})") print(f" spam recall: {cv['spam_recall']['mean']:.4f} (+/- {cv['spam_recall']['std']:.4f})") print("\nFitting final model on full dataset ...") final_model = build_model() final_model.fit(X, y) # Final-fit confusion matrix on the same data (training fit, not held-out) train_pred = final_model.predict(X) train_cm = confusion_matrix(y, train_pred, labels=[0, 1]) print(f"\nTrain-fit confusion matrix [[hh, hs],[sh, ss]]: {train_cm.tolist()}") print("\n" + classification_report(y, train_pred, target_names=LABELS, digits=4)) print(f"\nSaving model -> {out_path}") joblib.dump({ "pipeline": final_model, "labels": LABELS, "metadata": { "base_model": "LinearSVC (calibrated)", "feature_pipeline": "word_tfidf(1,2) + char_tfidf(3,5) + surface", "training_rows": len(df), "spam_count": int((df["label"] == "spam").sum()), "ham_count": int((df["label"] == "ham").sum()), "seed": args.seed, }, }, out_path, compress=3) print(f"Saving CV report -> {report_path}") report_path.write_text(json.dumps({ "cv": cv, "train_fit_confusion_matrix": { "labels": LABELS, "matrix": train_cm.tolist(), }, }, indent=2)) return 0 if __name__ == "__main__": raise SystemExit(main())