File size: 1,407 Bytes
a783939
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
"""
Text preprocessing for the fraud classifier.

The classifier sees noisy text — either user-typed messages or
Whisper transcripts which routinely insert/drop digits, mangle proper
nouns and produce homophone errors. To stop the model from memorising
specific amounts ("500 000 тенге") or specific phishing domains
("kaspi-bannk.com"), we normalise:

* Unicode → NFC (so "Қ" composes the same way regardless of input source).
* Lowercase (case is not semantically meaningful for fraud detection).
* URLs → ``<url>`` token (placeholder tokens are emitted in lowercase).
* Number-like sequences (incl. spaced thousands separators) → ``<num>``.
* Repeated whitespace → single space.

The same function is applied at train time (in ``ml_training/train.py``)
and at inference time (in ``app.ml.classifier``) so the model only ever
sees normalised text.
"""
from __future__ import annotations

import re
import unicodedata

_URL_RE = re.compile(
    # Explicit scheme or "www." prefix: consume everything up to whitespace.
    r"(?:https?://|www\.)\S+|"
    # Bare "host.tld" with one of the listed TLDs, plus any path glued to it.
    r"\b[\w\-]+\.(?:com|kz|online|live|ru|net|org|info|site|store|app)\b/?\S*",
    re.IGNORECASE,
)
# A run that starts and ends with a digit and may contain grouping or decimal
# separators in between ("500 000", "1.250,75"), or a lone digit.
# The space-like separators are written as \u escapes on purpose: a literal
# no-break space is indistinguishable from a plain space in an editor and is
# easily lost on copy/paste or reformatting.
#   \u00a0 NO-BREAK SPACE, \u2009 THIN SPACE, \u202f NARROW NO-BREAK SPACE
_NUM_RE = re.compile(r"\d[\d \u00a0\u2009\u202f.,]*\d|\d")
_WS_RE = re.compile(r"\s+")


def normalize_for_classifier(text: str) -> str:
    """Return *text* normalised for the fraud classifier.

    Steps, in order:

    1. Unicode NFC normalisation.
    2. Lowercasing.
    3. Every URL-like span is replaced by the token ``<url>``.
    4. Every number-like span (digits with optional space, no-break space,
       ``.`` or ``,`` separators between them) is replaced by ``<num>``.
    5. Runs of whitespace are collapsed to one space and the result is
       stripped.

    URLs are replaced before numbers so that digits inside a URL do not
    produce a separate ``<num>`` token.

    Args:
        text: Raw input text. Any falsy value (empty string, ``None``)
            yields an empty string.

    Returns:
        The normalised string; ``""`` for empty input.
    """
    if not text:
        return ""
    text = unicodedata.normalize("NFC", text).lower()
    # Tokens are padded with spaces so they never fuse with adjacent words;
    # the surplus whitespace is collapsed in the final step.
    text = _URL_RE.sub(" <url> ", text)
    text = _NUM_RE.sub(" <num> ", text)
    return _WS_RE.sub(" ", text).strip()