# sentinel-api/app/utils/text_utils.py
# Author: Mustafa Öztürk
# Commit 27e66da: Add raw repeat spam check with threshold 6
import re
import unicodedata
def _merge_spaced_letter_chains(text: str) -> str:
def _single_alpha(tok: str) -> str:
cleaned = re.sub(r"[^a-zA-ZçğıöşüÇĞİÖŞÜ]", "", tok)
return cleaned if len(cleaned) == 1 and cleaned.isalpha() else ""
tokens = text.split()
if not tokens:
return text
merged = []
i = 0
n = len(tokens)
while i < n:
tok = tokens[i]
single = _single_alpha(tok)
if single:
letters = [single]
j = i + 1
while j < n:
next_single = _single_alpha(tokens[j])
if not next_single:
break
letters.append(next_single)
j += 1
# Join only real obfuscation chains like "g e r i z e k a l i".
if len(letters) >= 2:
merged.append("".join(letters))
else:
merged.append(tok)
i = j
continue
merged.append(tok)
i += 1
return " ".join(merged)
def clean_text_nfkc(text: str) -> str:
    """Normalize raw user text for spam/abuse matching.

    Pipeline: NFKC-normalize, Turkish-aware lowercase (İ→i, I→ı), strip
    ./-/_/* separators hidden inside words, undo common leetspeak digit
    substitutions, squash exaggerated character repeats, collapse
    whitespace, then merge spaced-out letter chains.
    """
    normalized = unicodedata.normalize('NFKC', str(text))
    # Handle the Turkish dotted/dotless I pair before the generic lower().
    normalized = normalized.replace('İ', 'i').replace('I', 'ı').lower()
    # Drop separator characters used to split a word ("s.a.l.a.k" -> "salak").
    normalized = re.sub(r'(?<=[a-zğüşıöç0-9])[\.\-_\*]+(?=[a-zğüşıöç0-9])', '', normalized)
    # Map leetspeak digits back to letters (order is irrelevant: no
    # replacement produces another digit).
    for digit, letter in (('0', 'o'), ('1', 'i'), ('3', 'e'),
                          ('4', 'a'), ('5', 's'), ('7', 't'), ('8', 'b')):
        normalized = normalized.replace(digit, letter)
    # Keep natural double letters (e.g., "kullanici") and only squash exaggerated repeats.
    normalized = re.sub(r'(.)\1{2,}', r'\1', normalized)
    normalized = " ".join(normalized.split())
    return _merge_spaced_letter_chains(normalized)
def check_blacklist(text: str, blacklist_set: set) -> bool:
    """Return True if any whitespace-separated token of *text* is blacklisted."""
    return any(token in blacklist_set for token in text.split())
def is_spam(temiz: str, dil: str = "tr", ham_metin: str = "") -> bool:
# Raw-text check protects against normalization hiding exaggerated repeats.
raw_text = str(ham_metin) if ham_metin else temiz
raw_tokens = [t for t in raw_text.split() if t]
if len(raw_tokens) == 1 and re.search(r'(.)\1{5,}', raw_text.lower()):
return True
sadece_harf = re.sub(r'[^a-zğüşıöç]', '', temiz)
n = len(sadece_harf)
if n < 2:
return True
sesli = set('aeıioöuüeiou')
sesli_oran = sum(1 for c in sadece_harf if c in sesli) / max(n, 1)
if 5 < n < 100 and sesli_oran < 0.15:
return True
if dil == "tr":
tr_olmayan = set('wqx')
tr_olmayan_oran = sum(1 for c in sadece_harf if c in tr_olmayan) / max(n, 1)
if tr_olmayan_oran > 0.2:
return True
unique_chars = len(set(sadece_harf))
if 10 < n < 50:
if unique_chars / n < 0.25:
return True
elif n >= 50:
if unique_chars < 8:
return True
if re.search(r'(.)\1{6,}', temiz):
return True
n_temiz = len(temiz)
for blok in range(3, min(10, n_temiz // 2 + 1)):
pattern = temiz[:blok]
tekrar = temiz.count(pattern)
if tekrar >= 4 and tekrar * blok >= n_temiz * 0.7:
return True
return False