fraud-detector-api / app /services /preprocess.py
chotam's picture
Deploy fraud detector API
a783939
"""
Text preprocessing for the fraud classifier.
The classifier sees noisy text — either user-typed messages or
Whisper transcripts which routinely insert/drop digits, mangle proper
nouns and produce homophone errors. To stop the model from memorising
specific amounts ("500 000 тенге") or specific phishing domains
("kaspi-bannk.com"), we normalise:
* Unicode → NFC (so "Қ" composes the same way regardless of input source).
* Lowercase (case is not semantically meaningful for fraud detection).
* URLs → ``<url>`` token (placeholders are emitted in lowercase, like the rest of the text).
* Number-like sequences (incl. spaced thousands separators) → ``<num>``.
* Repeated whitespace → single space.
The same function is applied at train time (in ``ml_training/train.py``)
and at inference time (in ``app.ml.classifier``) so the model only ever
sees normalised text.
"""
from __future__ import annotations
import re
import unicodedata
# Matches either an explicit URL (``http://``, ``https://`` or ``www.``
# prefix followed by non-space characters) or a bare ``label.tld`` host for
# a fixed list of TLDs, together with any non-space characters that follow
# it directly (path, query string, trailing punctuation).
_URL_RE = re.compile(
    r"(?:https?://|www\.)\S+|"
    r"\b[\w\-]+\.(?:com|kz|online|live|ru|net|org|info|site|store|app)\b/?\S*",
    re.IGNORECASE,
)

# A run that starts and ends with a digit and may contain digits, spaces,
# no-break spaces, dots and commas in between ("500 000", "1,250.50"), or a
# single digit. The no-break space is written as an explicit ``\u00a0``
# escape instead of a literal character so that it stays visible in review
# and cannot be silently collapsed into an ordinary space by an editor.
# NOTE(review): if the deployed model was trained with a pattern that did
# not contain U+00A0, NBSP-separated amounts now yield one <num> instead of
# two -- confirm against the training-time copy of this pattern.
_NUM_RE = re.compile(r"\d[\d \u00a0.,]*\d|\d")

# Any run of whitespace (including newlines and tabs).
_WS_RE = re.compile(r"\s+")


def normalize_for_classifier(text: str) -> str:
    """Return *text* normalised for the fraud classifier.

    The steps are applied in this order: Unicode NFC normalisation,
    lowercasing, URL replacement with ``<url>``, number replacement with
    ``<num>``, then whitespace collapsing and stripping.

    Args:
        text: Raw message or transcript. Any falsy value (for example
            ``""`` or ``None``) produces an empty string.

    Returns:
        The normalised text, possibly empty.
    """
    if not text:
        return ""

    text = unicodedata.normalize("NFC", text).lower()

    # URLs are replaced before numbers so that digits inside a URL are
    # swallowed by the <url> placeholder instead of producing <num> tokens.
    text = _URL_RE.sub(" <url> ", text)
    text = _NUM_RE.sub(" <num> ", text)

    # The placeholders are inserted with padding spaces; collapse the
    # resulting runs (and any original ones) and trim both ends.
    return _WS_RE.sub(" ", text).strip()