import re
import unicodedata
def _merge_spaced_letter_chains(text: str) -> str:
def _single_alpha(tok: str) -> str:
cleaned = re.sub(r"[^a-zA-ZçğıöşüÇĞİÖŞÜ]", "", tok)
return cleaned if len(cleaned) == 1 and cleaned.isalpha() else ""
tokens = text.split()
if not tokens:
return text
merged = []
i = 0
n = len(tokens)
while i < n:
tok = tokens[i]
single = _single_alpha(tok)
if single:
letters = [single]
j = i + 1
while j < n:
next_single = _single_alpha(tokens[j])
if not next_single:
break
letters.append(next_single)
j += 1
# Join only real obfuscation chains like "g e r i z e k a l i".
if len(letters) >= 2:
merged.append("".join(letters))
else:
merged.append(tok)
i = j
continue
merged.append(tok)
i += 1
return " ".join(merged)
def clean_text_nfkc(text: str) -> str:
    """Normalize text for the spam/blacklist checks.

    Pipeline: NFKC unicode normalization, Turkish-aware lowercasing,
    removal of separators hidden inside words, leetspeak digit mapping,
    squashing of exaggerated character repeats, whitespace collapsing,
    and finally merging of spaced-out letter chains.
    """
    normalized = unicodedata.normalize('NFKC', str(text))
    # Turkish casing rules: İ -> i and I -> ı, applied before generic lower().
    normalized = normalized.replace('İ', 'i').replace('I', 'ı').lower()
    # Strip separators used to hide words, e.g. "g.e-r_i" -> "geri".
    normalized = re.sub(r'(?<=[a-zğüşıöç0-9])[\.\-_\*]+(?=[a-zğüşıöç0-9])', '', normalized)
    # Map common leetspeak digits back to letters.
    for digit, letter in (('0', 'o'), ('1', 'i'), ('3', 'e'), ('4', 'a'),
                          ('5', 's'), ('7', 't'), ('8', 'b')):
        normalized = normalized.replace(digit, letter)
    # Keep natural double letters (e.g., "kullanici") and only squash exaggerated repeats.
    normalized = re.sub(r'(.)\1{2,}', r'\1', normalized)
    normalized = " ".join(normalized.split())
    return _merge_spaced_letter_chains(normalized)
def check_blacklist(text: str, blacklist_set: set) -> bool:
    """Return True if any whitespace-separated token of *text* is blacklisted."""
    return any(token in blacklist_set for token in text.split())
def is_spam(temiz: str, dil: str = "tr", ham_metin: str = "") -> bool:
    """Heuristic spam detector for a cleaned text (*temiz*).

    Checks, in order: exaggerated repeats in the raw text, too-short
    content, vowel ratio, non-Turkish letter ratio (when dil == "tr"),
    character diversity, long single-character runs, and repeated-block
    patterns. Returns True as soon as any heuristic fires.
    """
    # Raw-text check protects against normalization hiding exaggerated repeats.
    raw = str(ham_metin) if ham_metin else temiz
    if len(raw.split()) == 1 and re.search(r'(.)\1{5,}', raw.lower()):
        return True

    letters_only = re.sub(r'[^a-zğüşıöç]', '', temiz)
    letter_count = len(letters_only)
    # Fewer than two letters carries no real content.
    if letter_count < 2:
        return True

    vowels = set('aeıioöuü')
    vowel_ratio = sum(c in vowels for c in letters_only) / max(letter_count, 1)
    if 5 < letter_count < 100 and vowel_ratio < 0.15:
        return True

    if dil == "tr":
        # w, q, x do not occur in native Turkish words.
        foreign = set('wqx')
        foreign_ratio = sum(c in foreign for c in letters_only) / max(letter_count, 1)
        if foreign_ratio > 0.2:
            return True

    # Low character diversity signals keyboard mashing.
    distinct = len(set(letters_only))
    if 10 < letter_count < 50 and distinct / letter_count < 0.25:
        return True
    if letter_count >= 50 and distinct < 8:
        return True

    # A run of 7+ identical characters in the cleaned text.
    if re.search(r'(.)\1{6,}', temiz):
        return True

    # Detect a short prefix block repeated to fill most of the text.
    total_len = len(temiz)
    for width in range(3, min(10, total_len // 2 + 1)):
        prefix = temiz[:width]
        occurrences = temiz.count(prefix)
        if occurrences >= 4 and occurrences * width >= total_len * 0.7:
            return True
    return False