# Turkish text normalization and spam-detection heuristics.
| import re | |
| import unicodedata | |
| def _merge_spaced_letter_chains(text: str) -> str: | |
| def _single_alpha(tok: str) -> str: | |
| cleaned = re.sub(r"[^a-zA-ZçğıöşüÇĞİÖŞÜ]", "", tok) | |
| return cleaned if len(cleaned) == 1 and cleaned.isalpha() else "" | |
| tokens = text.split() | |
| if not tokens: | |
| return text | |
| merged = [] | |
| i = 0 | |
| n = len(tokens) | |
| while i < n: | |
| tok = tokens[i] | |
| single = _single_alpha(tok) | |
| if single: | |
| letters = [single] | |
| j = i + 1 | |
| while j < n: | |
| next_single = _single_alpha(tokens[j]) | |
| if not next_single: | |
| break | |
| letters.append(next_single) | |
| j += 1 | |
| # Join only real obfuscation chains like "g e r i z e k a l i". | |
| if len(letters) >= 2: | |
| merged.append("".join(letters)) | |
| else: | |
| merged.append(tok) | |
| i = j | |
| continue | |
| merged.append(tok) | |
| i += 1 | |
| return " ".join(merged) | |
def clean_text_nfkc(text: str) -> str:
    """Normalize (possibly obfuscated) Turkish text for matching.

    Pipeline: NFKC unicode normalization -> Turkish-aware lowercasing ->
    removal of in-word separator characters -> leetspeak digit mapping ->
    squashing of exaggerated character repeats -> whitespace squeeze ->
    merging of spaced-out letter chains.
    """
    normalized = unicodedata.normalize('NFKC', str(text))
    # Turkish casing must be fixed before lower(): İ -> i and I -> ı,
    # otherwise Python applies English casing rules.
    lowered = normalized.replace('İ', 'i').replace('I', 'ı').lower()
    # Drop separators hidden inside words, e.g. "a.p-t*a_l" -> "aptal".
    lowered = re.sub(r'(?<=[a-zğüşıöç0-9])[\.\-_\*]+(?=[a-zğüşıöç0-9])', '', lowered)
    # Map common leetspeak digits back to letters in one translate() pass.
    lowered = lowered.translate(str.maketrans('0134578', 'oieastb'))
    # Keep natural double letters (e.g., "kullanici") and only squash exaggerated repeats.
    lowered = re.sub(r'(.)\1{2,}', r'\1', lowered)
    squeezed = " ".join(lowered.split())
    return _merge_spaced_letter_chains(squeezed)
def check_blacklist(text: str, blacklist_set: set) -> bool:
    """Return True when any whitespace-delimited token of *text* is an
    exact member of *blacklist_set*."""
    return not blacklist_set.isdisjoint(text.split())
def is_spam(temiz: str, dil: str = "tr", ham_metin: str = "") -> bool:
    """Heuristic spam/garbage detector for already-normalized text.

    Args:
        temiz: normalized text (output of clean_text_nfkc).
        dil: language code; "tr" enables Turkish-specific checks.
        ham_metin: optional raw text, checked separately because
            normalization can hide exaggerated character repeats.

    Returns:
        True when any heuristic flags the text as spam.
    """
    # Raw-text check protects against normalization hiding exaggerated repeats:
    # a single raw token with a character repeated 6+ times ("heyyyyyyy").
    raw = str(ham_metin) if ham_metin else temiz
    if len(raw.split()) == 1 and re.search(r'(.)\1{5,}', raw.lower()):
        return True

    letters = re.sub(r'[^a-zğüşıöç]', '', temiz)
    total = len(letters)
    # Fewer than two letters overall: nothing meaningful to moderate.
    if total < 2:
        return True

    # Mid-length text with almost no vowels reads as keyboard mash.
    vowels = set('aeıioöuü')
    vowel_ratio = sum(c in vowels for c in letters) / max(total, 1)
    if 5 < total < 100 and vowel_ratio < 0.15:
        return True

    # Turkish text dominated by w/q/x (absent from the Turkish alphabet).
    if dil == "tr":
        foreign_ratio = sum(c in 'wqx' for c in letters) / max(total, 1)
        if foreign_ratio > 0.2:
            return True

    # Low character diversity for the text's length.
    distinct = len(set(letters))
    if 10 < total < 50:
        if distinct / total < 0.25:
            return True
    elif total >= 50 and distinct < 8:
        return True

    # Any character repeated 7+ times even in the normalized text.
    if re.search(r'(.)\1{6,}', temiz):
        return True

    # A short prefix repeated often enough to cover ~70% of the text
    # (e.g. "spamspamspamspam").
    length = len(temiz)
    for size in range(3, min(10, length // 2 + 1)):
        prefix = temiz[:size]
        hits = temiz.count(prefix)
        if hits >= 4 and hits * size >= length * 0.7:
            return True

    return False