import re
import unicodedata

# Everything that is not an ASCII or Turkish letter (used to isolate the
# letter inside tokens such as "a." or "(b)").
_NON_LETTER_RE = re.compile(r"[^a-zA-ZçğıöşüÇĞİÖŞÜ]")
# Separator runs wedged between two alphanumerics, e.g. "s.p-a_m".
_INNER_SEPARATOR_RE = re.compile(
    r'(?<=[a-zğüşıöç0-9])[\.\-_\*]+(?=[a-zğüşıöç0-9])'
)
# Three or more consecutive occurrences of the same character.
_EXAGGERATED_REPEAT_RE = re.compile(r'(.)\1{2,}')
# Six or more consecutive occurrences (raw, un-normalized text).
_RAW_LONG_REPEAT_RE = re.compile(r'(.)\1{5,}')
# Seven or more consecutive occurrences (cleaned text).
_CLEAN_LONG_REPEAT_RE = re.compile(r'(.)\1{6,}')
# Everything that is not a lowercase ASCII or Turkish letter.
_NON_LOWER_LETTER_RE = re.compile(r'[^a-zğüşıöç]')

# Digit -> letter substitutions commonly used to disguise words.
_LEET_TABLE = str.maketrans({
    '0': 'o',
    '1': 'i',
    '3': 'e',
    '4': 'a',
    '5': 's',
    '7': 't',
    '8': 'b',
})

_VOWELS = frozenset('aeıioöuü')
_NON_TURKISH_LETTERS = frozenset('wqx')


def _single_alpha(tok: str) -> str:
    """Return the lone letter contained in *tok*, or "" if there is not exactly one.

    Non-letter characters are ignored, so "a." and "(b)" both count as
    single-letter tokens.
    """
    cleaned = _NON_LETTER_RE.sub("", tok)
    return cleaned if len(cleaned) == 1 and cleaned.isalpha() else ""


def _merge_spaced_letter_chains(text: str) -> str:
    """Glue runs of single-letter tokens back into one word.

    "g e r i z e k a l i" becomes "gerizekali". A single-letter token with
    no single-letter neighbour is left untouched (punctuation included).
    Tokens are re-joined with single spaces; text without any tokens is
    returned unchanged.
    """
    tokens = text.split()
    if not tokens:
        return text

    merged = []
    i = 0
    n = len(tokens)
    while i < n:
        tok = tokens[i]
        first = _single_alpha(tok)
        if not first:
            merged.append(tok)
            i += 1
            continue

        # Collect the maximal run of consecutive single-letter tokens.
        letters = [first]
        j = i + 1
        while j < n:
            following = _single_alpha(tokens[j])
            if not following:
                break
            letters.append(following)
            j += 1

        # Join only real obfuscation chains like "g e r i z e k a l i".
        merged.append("".join(letters) if len(letters) >= 2 else tok)
        i = j

    return " ".join(merged)


def clean_text_nfkc(text: str) -> str:
    """Normalize *text* so that obfuscated words compare equal to plain ones.

    Steps, in order: NFKC normalization, Turkish-aware lowercasing
    ('İ' -> 'i', 'I' -> 'ı'), removal of ``. - _ *`` runs sandwiched between
    alphanumerics, leet digit substitution, squashing of any character
    repeated three or more times down to one, whitespace collapsing, and
    merging of spaced-out single letters.

    Args:
        text: Input value; it is passed through ``str()`` first.

    Returns:
        The normalized, lowercased string.
    """
    text = unicodedata.normalize('NFKC', str(text))
    # Map the Turkish capitals by hand: str.lower() alone would turn 'I'
    # into 'i' and 'İ' into 'i' plus a combining dot.
    text = text.replace('İ', 'i').replace('I', 'ı').lower()
    text = _INNER_SEPARATOR_RE.sub('', text)
    text = text.translate(_LEET_TABLE)
    # Keep natural double letters (e.g., "kullanici") and only squash exaggerated repeats.
    text = _EXAGGERATED_REPEAT_RE.sub(r'\1', text)
    text = " ".join(text.split())
    return _merge_spaced_letter_chains(text)


def check_blacklist(text: str, blacklist_set: set) -> bool:
    """Return True if any whitespace-separated token of *text* is in *blacklist_set*."""
    return not blacklist_set.isdisjoint(text.split())


def _prefix_block_dominates(text: str) -> bool:
    """Return True if a 3-9 character prefix of *text* repeats across most of it.

    A prefix qualifies when it occurs at least four times (non-overlapping)
    and those occurrences cover at least 70% of the text length.
    """
    total = len(text)
    for size in range(3, min(10, total // 2 + 1)):
        hits = text.count(text[:size])
        if hits >= 4 and hits * size >= total * 0.7:
            return True
    return False


def is_spam(temiz: str, dil: str = "tr", ham_metin: str = "") -> bool:
    """Heuristically decide whether a message is gibberish / spam.

    Args:
        temiz: The cleaned (normalized, lowercased) text.
        dil: Language code; "tr" additionally enables the w/q/x ratio check.
        ham_metin: Optional raw text. When empty, *temiz* is used for the
            raw-text repeat check instead.

    Returns:
        True if any heuristic fires, otherwise False.
    """
    # Raw-text check protects against normalization hiding exaggerated repeats.
    raw_text = str(ham_metin) if ham_metin else temiz
    if len(raw_text.split()) == 1 and _RAW_LONG_REPEAT_RE.search(raw_text.lower()):
        return True

    sadece_harf = _NON_LOWER_LETTER_RE.sub('', temiz)
    n = len(sadece_harf)
    # Fewer than two letters: nothing meaningful to analyse.
    if n < 2:
        return True

    # Too few vowels for the text to be pronounceable.
    sesli_oran = sum(1 for c in sadece_harf if c in _VOWELS) / n
    if 5 < n < 100 and sesli_oran < 0.15:
        return True

    if dil == "tr":
        # w, q and x are not part of the Turkish alphabet.
        tr_olmayan_oran = sum(
            1 for c in sadece_harf if c in _NON_TURKISH_LETTERS
        ) / n
        if tr_olmayan_oran > 0.2:
            return True

    # NOTE(review): the source's indentation was lost; this diversity check is
    # assumed to apply to every language, not only "tr" - confirm.
    unique_chars = len(set(sadece_harf))
    if 10 < n < 50:
        if unique_chars / n < 0.25:
            return True
    elif n >= 50:
        if unique_chars < 8:
            return True

    if _CLEAN_LONG_REPEAT_RE.search(temiz):
        return True

    return _prefix_block_dominates(temiz)