Spaces:

chotam
/

fraud-detector-api

Sleeping

Deploy fraud detector API

a783939 2 days ago

1.41 kB

	"""
	Text preprocessing for the fraud classifier.

	The classifier sees noisy text — either user-typed messages or
	Whisper transcripts which routinely insert/drop digits, mangle proper
	nouns and produce homophone errors. To stop the model from memorising
	specific amounts ("500 000 тенге") or specific phishing domains
	("kaspi-bannk.com"), we normalise:

	* Unicode → NFC (so "Қ" composes the same way regardless of input source).
	* Lowercase (case is not semantically meaningful for fraud detection).
	* URLs → ``<url>`` token.
	* Number-like sequences (incl. spaced thousands separators) → ``<num>``.
	* Repeated whitespace → single space.

	The same function is applied at train time (in ``ml_training/train.py``)
	and at inference time (in ``app.ml.classifier``) so the model only ever
	sees normalised text.
	"""
	from __future__ import annotations

	import re
	import unicodedata

	# NOTE: the alternations in these patterns must be a plain ``|``. An escaped
	# ``\|`` matches a literal pipe character, which silently disables the
	# URL and number replacement.

	# Explicit URLs (``http://``, ``https://``, ``www.``) up to the next
	# whitespace, or a bare domain on one of the listed TLDs together with any
	# path that follows it.
	_URL_RE = re.compile(
	    r"(?:https?://|www\.)\S+|"
	    r"\b[\w\-]+\.(?:com|kz|online|live|ru|net|org|info|site|store|app)\b/?\S*",
	    re.IGNORECASE,
	)
	# A run that starts and ends with a digit and may contain spaces, dots or
	# commas in between (e.g. "500 000", "1,500.50"), or a single digit.
	_NUM_RE = re.compile(r"\d[\d .,]*\d|\d")
	# Any run of whitespace characters.
	_WS_RE = re.compile(r"\s+")


	def normalize_for_classifier(text: str) -> str:
	    """Return *text* normalised for the fraud classifier.

	    The steps run in this order:

	    1. Unicode NFC normalisation.
	    2. Lowercasing.
	    3. Every URL match is replaced by the ``<url>`` token.
	    4. Every number-like match is replaced by the ``<num>`` token.
	    5. Whitespace runs are collapsed to one space and the ends are stripped.

	    URLs are replaced before numbers, so digits inside a URL are consumed
	    by the ``<url>`` token rather than producing their own ``<num>``.

	    Args:
	        text: Raw input text.

	    Returns:
	        The normalised text, or ``""`` when *text* is empty or otherwise
	        falsy.
	    """
	    if not text:
	        return ""
	    text = unicodedata.normalize("NFC", text)
	    text = text.lower()
	    # Tokens are padded with spaces so they never fuse with neighbouring
	    # words; the whitespace pass below removes the surplus.
	    text = _URL_RE.sub(" <url> ", text)
	    text = _NUM_RE.sub(" <num> ", text)
	    text = _WS_RE.sub(" ", text).strip()
	    return text