""" Text preprocessing for the fraud classifier. The classifier sees noisy text — either user-typed messages or Whisper transcripts which routinely insert/drop digits, mangle proper nouns and produce homophone errors. To stop the model from memorising specific amounts ("500 000 тенге") or specific phishing domains ("kaspi-bannk.com"), we normalise: * Unicode → NFC (so "Қ" composes the same way regardless of input source). * Lowercase (case is not semantically meaningful for fraud detection). * URLs → ```` token. * Number-like sequences (incl. spaced thousands separators) → ````. * Repeated whitespace → single space. The same function is applied at train time (in ``ml_training/train.py``) and at inference time (in ``app.ml.classifier``) so the model only ever sees normalised text. """ from __future__ import annotations import re import unicodedata _URL_RE = re.compile( r"(?:https?://|www\.)\S+|" r"\b[\w\-]+\.(?:com|kz|online|live|ru|net|org|info|site|store|app)\b/?\S*", re.IGNORECASE, ) _NUM_RE = re.compile(r"\d[\d  .,]*\d|\d") _WS_RE = re.compile(r"\s+") def normalize_for_classifier(text: str) -> str: if not text: return "" text = unicodedata.normalize("NFC", text) text = text.lower() text = _URL_RE.sub(" ", text) text = _NUM_RE.sub(" ", text) text = _WS_RE.sub(" ", text).strip() return text