| """ | |
| Text preprocessing for the fraud classifier. | |
| The classifier sees noisy text — either user-typed messages or | |
| Whisper transcripts which routinely insert/drop digits, mangle proper | |
| nouns and produce homophone errors. To stop the model from memorising | |
| specific amounts ("500 000 тенге") or specific phishing domains | |
| ("kaspi-bannk.com"), we normalise: | |
| * Unicode → NFC (so "Қ" composes the same way regardless of input source). | |
| * Lowercase (case is not semantically meaningful for fraud detection). | |
| * URLs → ``<URL>`` token. | |
| * Number-like sequences (incl. spaced thousands separators) → ``<NUM>``. | |
| * Repeated whitespace → single space. | |
| The same function is applied at train time (in ``ml_training/train.py``) | |
| and at inference time (in ``app.ml.classifier``) so the model only ever | |
| sees normalised text. | |
| """ | |
from __future__ import annotations

import re
import unicodedata
# Explicit-scheme / "www." URLs, or bare domains on TLDs commonly seen in
# phishing messages. Pattern text must stay in sync with training-time use.
_URL_RE = re.compile(
    r"(?:https?://|www\.)\S+|"
    r"\b[\w\-]+\.(?:com|kz|online|live|ru|net|org|info|site|store|app)\b/?\S*",
    re.IGNORECASE,
)
# A digit run, optionally with space/dot/comma thousands separators inside
# ("500 000", "1,000.50"); a lone digit also matches via the second branch.
_NUM_RE = re.compile(r"\d[\d .,]*\d|\d")
# Any whitespace run (spaces, tabs, newlines) — collapsed to one space.
_WS_RE = re.compile(r"\s+")

# Token substitutions, applied in order: URLs are replaced first so digits
# inside a URL are not re-tokenised as numbers afterwards.
_TOKEN_SUBS = (
    (_URL_RE, " <url> "),
    (_NUM_RE, " <num> "),
)


def normalize_for_classifier(text: str) -> str:
    """Return *text* in the canonical form the fraud classifier expects.

    Pipeline: NFC-normalise, lowercase, replace URLs with ``<url>`` and
    number-like sequences with ``<num>``, then collapse all whitespace runs
    to single spaces and strip the ends. Falsy input (``""``/``None``)
    yields ``""``. Applied identically at train and inference time.
    """
    if not text:
        return ""
    cleaned = unicodedata.normalize("NFC", text).lower()
    for pattern, token in _TOKEN_SUBS:
        cleaned = pattern.sub(token, cleaned)
    return _WS_RE.sub(" ", cleaned).strip()