# sentinel-api/app/utils/text_utils.py
# Author: Mustafa Öztürk
# Commit 27e66da: Add raw repeat spam check with threshold 6
import re
import unicodedata
def _merge_spaced_letter_chains(text: str) -> str:
def _single_alpha(tok: str) -> str:
cleaned = re.sub(r"[^a-zA-ZçğıöşüÇĞİÖŞÜ]", "", tok)
return cleaned if len(cleaned) == 1 and cleaned.isalpha() else ""
tokens = text.split()
if not tokens:
return text
merged = []
i = 0
n = len(tokens)
while i < n:
tok = tokens[i]
single = _single_alpha(tok)
if single:
letters = [single]
j = i + 1
while j < n:
next_single = _single_alpha(tokens[j])
if not next_single:
break
letters.append(next_single)
j += 1
# Join only real obfuscation chains like "g e r i z e k a l i".
if len(letters) >= 2:
merged.append("".join(letters))
else:
merged.append(tok)
i = j
continue
merged.append(tok)
i += 1
return " ".join(merged)
def clean_text_nfkc(text: str) -> str:
    """Normalize raw user text for spam/abuse matching.

    Pipeline: NFKC-normalize, Turkish-aware lowercase (İ→i, I→ı), strip
    ./-/_/* separators hidden inside words, undo common leetspeak digit
    substitutions, squash exaggerated character repeats, collapse
    whitespace, then merge spaced-out letter chains.
    """
    normalized = unicodedata.normalize('NFKC', str(text))
    # Handle the Turkish dotted/dotless I pair before the generic lower().
    normalized = normalized.replace('İ', 'i').replace('I', 'ı').lower()
    # Drop separator characters used to split a word ("s.a.l.a.k" -> "salak").
    normalized = re.sub(r'(?<=[a-zğüşıöç0-9])[\.\-_\*]+(?=[a-zğüşıöç0-9])', '', normalized)
    # Map leetspeak digits back to letters (order is irrelevant: no
    # replacement produces another digit).
    for digit, letter in (('0', 'o'), ('1', 'i'), ('3', 'e'),
                          ('4', 'a'), ('5', 's'), ('7', 't'), ('8', 'b')):
        normalized = normalized.replace(digit, letter)
    # Keep natural double letters (e.g., "kullanici") and only squash exaggerated repeats.
    normalized = re.sub(r'(.)\1{2,}', r'\1', normalized)
    normalized = " ".join(normalized.split())
    return _merge_spaced_letter_chains(normalized)
def check_blacklist(text: str, blacklist_set: set) -> bool:
    """Return True if any whitespace-separated token of *text* is blacklisted."""
    return any(token in blacklist_set for token in text.split())
def is_spam(temiz: str, dil: str = "tr", ham_metin: str = "") -> bool:
# Raw-text check protects against normalization hiding exaggerated repeats.
raw_text = str(ham_metin) if ham_metin else temiz
raw_tokens = [t for t in raw_text.split() if t]
if len(raw_tokens) == 1 and re.search(r'(.)\1{5,}', raw_text.lower()):
return True
sadece_harf = re.sub(r'[^a-zğüşıöç]', '', temiz)
n = len(sadece_harf)
if n < 2:
return True
sesli = set('aeıioöuüeiou')
sesli_oran = sum(1 for c in sadece_harf if c in sesli) / max(n, 1)
if 5 < n < 100 and sesli_oran < 0.15:
return True
if dil == "tr":
tr_olmayan = set('wqx')
tr_olmayan_oran = sum(1 for c in sadece_harf if c in tr_olmayan) / max(n, 1)
if tr_olmayan_oran > 0.2:
return True
unique_chars = len(set(sadece_harf))
if 10 < n < 50:
if unique_chars / n < 0.25:
return True
elif n >= 50:
if unique_chars < 8:
return True
if re.search(r'(.)\1{6,}', temiz):
return True
n_temiz = len(temiz)
for blok in range(3, min(10, n_temiz // 2 + 1)):
pattern = temiz[:blok]
tekrar = temiz.count(pattern)
if tekrar >= 4 and tekrar * blok >= n_temiz * 0.7:
return True
return False