"""
Feature pipeline for the SMS spam classifier.

The pipeline is a FeatureUnion over three blocks:

1. Word-level TF-IDF (1+2 grams) — captures vocabulary patterns
   ("free", "prize guaranteed").
2. Character-level TF-IDF (3-5 grams, char_wb) — captures sub-word
   patterns and spelling variants ("Fr3e" shares character pieces with
   "free"). Same idea FastText popularised, sklearn-compatible.
3. Hand-crafted surface features — length, digit ratio, uppercase ratio,
   punctuation counts, has-URL / has-phone / has-currency booleans.

No explicit stop_words list is used. max_df=0.95 plus IDF weighting
suppresses common words in a more principled way than sklearn's default
English stop list, which would remove domain-meaningful words like
"call".
"""

from __future__ import annotations

import re

import numpy as np
from scipy import sparse
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion

# Heuristic URL detector: an explicit http(s):// or www. prefix followed by
# non-space characters, or a bare ".co.uk" / ".com" suffix. Case-insensitive.
URL_RE = re.compile(
    r"\bhttps?://\S+|\bwww\.\S+|\.co\.uk\b|\.com\b",
    flags=re.IGNORECASE,
)

# Heuristic phone/short-code detector: any standalone run of 4 or more digits.
PHONE_RE = re.compile(
    r"\b\d{4,}\b",
)

# Currency detector: a pound/dollar/euro symbol, or a standalone
# GBP / USD / EUR code in any letter case.
CURRENCY_RE = re.compile(
    r"[£$€]|\b(?:GBP|USD|EUR)\b",
    flags=re.IGNORECASE,
)


class SurfaceFeatures(BaseEstimator, TransformerMixin):
    """Hand-crafted features that encode 'spam looks visibly different'.

    All features are normalised to roughly [0, 1] at extraction time, so
    no separate scaler is needed in the pipeline. This keeps the pickled
    model robust across sklearn versions (sklearn pickles are notoriously
    fragile across MaxAbsScaler / StandardScaler internals between releases).

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    depends only on the message text and the module-level regexes.
    """

    # Column order of the matrix returned by ``transform``.
    feature_names = [
        "length_norm",      # length / 200 (most SMS are under 200 chars), clipped to 1
        "n_words_norm",     # n_words / 50 (most SMS are under 50 words), clipped to 1
        "digit_ratio",      # already 0..1
        "upper_ratio",      # already 0..1
        "punct_ratio",      # already 0..1
        "has_url",
        "has_phone",
        "has_currency",
        "n_exclamation_norm",  # exclamations / 10, clipped to 1
    ]

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is no state to learn."""
        return self

    def get_feature_names_out(self, input_features=None):
        """Return the output column names, in ``transform`` column order.

        ``input_features`` is accepted for compatibility with the sklearn
        transformer API (FeatureUnion passes it through) and is ignored.
        Without this method, ``FeatureUnion.get_feature_names_out`` fails on
        the union that contains this transformer.
        """
        return np.asarray(self.feature_names, dtype=object)

    def transform(self, X):
        """Compute one row of surface features per message.

        Parameters
        ----------
        X : iterable of str
            Raw messages. Falsy entries (``None``, ``""``) produce an
            all-zero row.

        Returns
        -------
        scipy.sparse.csr_matrix of float64
            Shape ``(n_messages, len(feature_names))``, including
            ``(0, len(feature_names))`` for empty input.
        """
        n_features = len(self.feature_names)
        rows = []
        for msg in X:
            if not msg:
                rows.append([0.0] * n_features)
                continue
            length = len(msg)
            n_words = len(msg.split())
            digits = sum(1 for c in msg if c.isdigit())
            uppers = sum(1 for c in msg if c.isupper())
            # Only this fixed set of sentence punctuation is counted.
            puncts = sum(1 for c in msg if c in ".,!?;:")
            exclamation = msg.count("!")
            rows.append([
                min(length / 200.0, 1.0),
                min(n_words / 50.0, 1.0),
                digits / length if length else 0.0,
                uppers / length if length else 0.0,
                puncts / length if length else 0.0,
                1.0 if URL_RE.search(msg) else 0.0,
                1.0 if PHONE_RE.search(msg) else 0.0,
                1.0 if CURRENCY_RE.search(msg) else 0.0,
                min(exclamation / 10.0, 1.0),
            ])
        # An empty ``rows`` list would become a 1-D array of shape (0,), which
        # csr_matrix interprets as a single row with zero columns. Forcing the
        # 2-D shape keeps the column count stable so the result can still be
        # stacked next to the TF-IDF blocks.
        dense = np.asarray(rows, dtype=np.float64).reshape(len(rows), n_features)
        return sparse.csr_matrix(dense)


def build_feature_pipeline() -> FeatureUnion:
    """Assemble the three-block feature union used by the classifier.

    The returned FeatureUnion concatenates, in this order:

    * ``word_tfidf`` - lowercased, accent-stripped word TF-IDF over
      unigrams and bigrams;
    * ``char_tfidf`` - ``char_wb`` TF-IDF over 3- to 5-character n-grams;
    * ``surface``    - the hand-crafted ``SurfaceFeatures`` block.

    Both TF-IDF blocks share ``min_df=2``, ``max_df=0.95`` and sublinear
    term-frequency scaling. The union is unfitted; callers must fit it.
    """
    # Settings common to the word-level and character-level vectorizers.
    shared_tfidf_kwargs = {
        "min_df": 2,
        "max_df": 0.95,
        "sublinear_tf": True,
    }

    word_block = TfidfVectorizer(
        lowercase=True,
        ngram_range=(1, 2),
        strip_accents="unicode",
        **shared_tfidf_kwargs,
    )
    char_block = TfidfVectorizer(
        analyzer="char_wb",
        ngram_range=(3, 5),
        **shared_tfidf_kwargs,
    )

    blocks = [
        ("word_tfidf", word_block),
        ("char_tfidf", char_block),
        ("surface", SurfaceFeatures()),
    ]
    return FeatureUnion(transformer_list=blocks, n_jobs=None)