File size: 1,961 Bytes
857d4f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import unicodedata
import re

def clean_text_nfkc(text: str) -> str:
    """Return a normalised, de-obfuscated lowercase form of *text*.

    The steps run in this order:

    1. Coerce to ``str`` and apply Unicode NFKC normalisation.
    2. Lowercase with the Turkish dotted/dotless I mapping
       (``İ`` -> ``i``, ``I`` -> ``ı``) applied before ``str.lower()``.
    3. Drop runs of ``.``, ``-``, ``_`` or ``*`` that sit directly between
       two lowercase letters/digits (``s.a.l`` -> ``sal``).
    4. Replace the digits ``0 1 3 4 5 7 8`` with the letters
       ``o i e a s t b``.
    5. Collapse every run of one repeated character to a single occurrence
       (this also shortens genuine double letters, e.g. ``ll`` -> ``l``).
    6. Strip the ends and squeeze whitespace runs to single spaces.
    """
    digit_to_letter = str.maketrans(
        {'0': 'o', '1': 'i', '3': 'e', '4': 'a', '5': 's', '7': 't', '8': 'b'}
    )

    normalized = unicodedata.normalize('NFKC', str(text))

    # The Turkish I pair is mapped by hand first: the default lower() would
    # turn 'I' into 'i' instead of the dotless 'ı'.
    turkish_cased = normalized.replace('İ', 'i')
    turkish_cased = turkish_cased.replace('I', 'ı')
    lowered = turkish_cased.lower()

    # Separators are only removed when wedged between two word characters.
    unsplit = re.sub(
        r'(?<=[a-zğüşıöç0-9])[\.\-_\*]+(?=[a-zğüşıöç0-9])', '', lowered
    )

    # One pass is enough: no substituted letter is itself a digit key.
    unleeted = unsplit.translate(digit_to_letter)

    deduplicated = re.sub(r'(.)\1+', r'\1', unleeted)
    words = deduplicated.split()
    return " ".join(words)


def check_blacklist(text: str, blacklist_set: set) -> bool:
    """Tell whether any whitespace-separated token of *text* is blacklisted.

    Matching is exact and per token: a blacklist entry that is only a
    substring of a token does not count.

    Args:
        text: The text to inspect; it is split on whitespace.
        blacklist_set: Set of forbidden tokens.

    Returns:
        True if at least one token of *text* is in *blacklist_set*.
    """
    tokens = set(text.split())
    matches = tokens & blacklist_set
    return len(matches) > 0


# Anything that is not a lowercase ASCII letter or a lowercase Turkish letter.
_NON_LETTER_RE = re.compile(r'[^a-zğüşıöç]')
_VOWELS = frozenset('aeıioöuü')
# Letters that trigger the dil == "tr" ratio check in is_spam.
_NON_TURKISH_LETTERS = frozenset('wqx')
# The same character five or more times in a row.
_LONG_RUN_RE = re.compile(r'(.)\1{4,}')
# Link fragments and promotional phrases, compiled once at import time.
_SPAM_PATTERN_RES = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'http[s]?://', r'www\.', r'\.com', r'\.net', r'\.org',
        r'click\s*here', r'buy\s*cheap', r'free\s*follow',
        r'tıkla.*kazan', r'ücretsiz.*takipçi', r'satın\s*al',
        r'indirim.*%', r'subscribe.*channel',
    )
)


def _is_repeated_prefix(text: str) -> bool:
    """Return True if a prefix of *text* repeats enough to dominate it.

    A prefix of length >= 2 qualifies when it occurs at least 3 times
    (non-overlapping) and those occurrences cover at least 60% of *text*.
    """
    total = len(text)
    required_coverage = total * 0.6
    # Three non-overlapping copies need 3 * size <= total, so prefixes longer
    # than total // 3 can never qualify and are not tried.
    for size in range(2, total // 3 + 1):
        occurrences = text.count(text[:size])
        if occurrences >= 3 and occurrences * size >= required_coverage:
            return True
    return False


def is_spam(temiz: str, dil: str = "tr") -> bool:
    """Heuristically decide whether *temiz* looks like spam or gibberish.

    The text is flagged (True) as soon as one of these checks fires, tested
    in this order:

    1. Fewer than 2 letters remain after dropping everything except
       lowercase ``a-z`` and ``ğüşıöç``. Uppercase letters are not counted.
    2. More than 5 letters with a vowel share below 15%.
    3. Only when ``dil == "tr"``: more than 20% of the letters are
       ``w``, ``q`` or ``x``.
    4. Any character repeated 5 or more times in a row.
    5. A prefix of the text (length >= 2) occurring at least 3 times and
       covering at least 60% of the text.
    6. More than 10 letters with a distinct-letter share below 25%.
    7. A match for one of the link / promotion patterns (case-insensitive).

    Args:
        temiz: Text to classify. Presumably already lowercased/cleaned by
            the caller; verify against call sites.
        dil: Language code; only ``"tr"`` enables check 3.

    Returns:
        True if the text is considered spam, False otherwise.
    """
    # NOTE(review): if callers pass the output of clean_text_nfkc, repeated
    # characters have already been collapsed there, so check 4 and patterns
    # such as 'www\.' and '://' can no longer match. Confirm what is passed.
    letters = _NON_LETTER_RE.sub('', temiz)
    letter_count = len(letters)
    if letter_count < 2:
        return True

    # letter_count is >= 2 from here on, so the divisions below are safe.
    vowel_ratio = sum(1 for ch in letters if ch in _VOWELS) / letter_count
    if letter_count > 5 and vowel_ratio < 0.15:
        return True

    if dil == "tr":
        foreign_count = sum(1 for ch in letters if ch in _NON_TURKISH_LETTERS)
        if foreign_count / letter_count > 0.2:
            return True

    if _LONG_RUN_RE.search(temiz):
        return True

    if _is_repeated_prefix(temiz):
        return True

    if letter_count > 10 and len(set(letters)) / letter_count < 0.25:
        return True

    return any(pattern.search(temiz) for pattern in _SPAM_PATTERN_RES)