File size: 3,145 Bytes
857d4f5
 
 
 
cce12b3
ddd2daa
 
 
 
cce12b3
 
 
 
 
 
 
 
 
 
ddd2daa
 
 
cce12b3
ddd2daa
 
 
 
 
cce12b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
857d4f5
 
 
 
 
 
 
625d4d3
 
cce12b3
 
857d4f5
 
 
 
 
 
27e66da
 
 
 
 
 
 
857d4f5
 
 
 
 
 
 
 
 
 
 
 
 
 
2647c15
857d4f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import re
import unicodedata


def _merge_spaced_letter_chains(text: str) -> str:
    def _single_alpha(tok: str) -> str:
        cleaned = re.sub(r"[^a-zA-ZçğıöşüÇĞİÖŞÜ]", "", tok)
        return cleaned if len(cleaned) == 1 and cleaned.isalpha() else ""

    tokens = text.split()
    if not tokens:
        return text

    merged = []
    i = 0
    n = len(tokens)

    while i < n:
        tok = tokens[i]
        single = _single_alpha(tok)
        if single:
            letters = [single]
            j = i + 1
            while j < n:
                next_single = _single_alpha(tokens[j])
                if not next_single:
                    break
                letters.append(next_single)
                j += 1

            # Join only real obfuscation chains like "g e r i z e k a l i".
            if len(letters) >= 2:
                merged.append("".join(letters))
            else:
                merged.append(tok)
            i = j
            continue

        merged.append(tok)
        i += 1

    return " ".join(merged)


def clean_text_nfkc(text: str) -> str:
    """Normalize text for profanity/blacklist matching.

    Pipeline: NFKC normalization, Turkish-aware lowercasing, removal of
    obfuscation separators, leet-digit substitution, squashing of
    exaggerated character repeats, whitespace collapsing, and finally
    merging of spaced-out letter chains.

    Args:
        text: Arbitrary input; coerced to str before processing.

    Returns:
        The normalized, lowercase text.
    """
    text = unicodedata.normalize('NFKC', str(text))
    # Turkish dotted/dotless I must be mapped before the generic lower(),
    # otherwise 'İ'.lower() yields 'i̇' (i + combining dot) and 'I' -> 'i'.
    text = text.replace('İ', 'i').replace('I', 'ı').lower()
    # Strip separators sandwiched between letters/digits ("k.u-f*u.r").
    text = re.sub(r'(?<=[a-zğüşıöç0-9])[\.\-_\*]+(?=[a-zğüşıöç0-9])', '', text)
    # Leet-speak digits -> letters in a single C-level pass instead of
    # seven chained str.replace() calls (same mapping, same result).
    text = text.translate(str.maketrans('0134578', 'oieastb'))
    # Keep natural double letters (e.g., "elli") and only squash 3+ repeats.
    text = re.sub(r'(.)\1{2,}', r'\1', text)
    text = " ".join(text.split())
    return _merge_spaced_letter_chains(text)


def check_blacklist(text: str, blacklist_set: set) -> bool:
    """Return True when any whitespace-separated token of *text* is in *blacklist_set*."""
    return any(token in blacklist_set for token in text.split())


def is_spam(temiz: str, dil: str = "tr", ham_metin: str = "") -> bool:
    """Heuristic spam/gibberish detector for normalized text.

    Args:
        temiz: Normalized ("clean") text to score.
        dil: Language code; "tr" enables the Turkish-foreign-letter check.
        ham_metin: Optional raw text, inspected for exaggerated repeats
            that normalization may have squashed away.

    Returns:
        True when any heuristic flags the input as spam.
    """
    # Single raw token made of a long character run ("aaaaaaa") is spam
    # even if normalization already collapsed it in `temiz`.
    raw = str(ham_metin) if ham_metin else temiz
    if len(raw.split()) == 1 and re.search(r'(.)\1{5,}', raw.lower()):
        return True

    letters = re.sub(r'[^a-zğüşıöç]', '', temiz)
    letter_count = len(letters)

    # Fewer than two letters: too short to be a real message.
    if letter_count < 2:
        return True

    # Mid-length text with almost no vowels reads as keyboard mashing.
    vowels = set('aeıioöuü')
    if 5 < letter_count < 100:
        if sum(c in vowels for c in letters) / letter_count < 0.15:
            return True

    # Turkish text should rarely contain w/q/x.
    if dil == "tr":
        if sum(c in 'wqx' for c in letters) / letter_count > 0.2:
            return True

    # Low character diversity relative to length suggests gibberish.
    distinct = len(set(letters))
    if 10 < letter_count < 50 and distinct / letter_count < 0.25:
        return True
    if letter_count >= 50 and distinct < 8:
        return True

    # A single character repeated 7+ times anywhere in the clean text.
    if re.search(r'(.)\1{6,}', temiz):
        return True

    # Text dominated by repetitions of its own short prefix ("abcabcabc...").
    total = len(temiz)
    for size in range(3, min(10, total // 2 + 1)):
        prefix = temiz[:size]
        hits = temiz.count(prefix)
        if hits >= 4 and hits * size >= total * 0.7:
            return True

    return False