Spaces:

moztrk
/

sentinel-api

Runtime error

App Files Files Community

Mustafa Öztürk committed on Mar 10

Commit

9628b21

1 Parent(s): 78509a9

Improve spam heuristics and reduce short-token false positives

Browse files

Files changed (1) hide show

app/utils/text_utils.py +24 -1

app/utils/text_utils.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import re
 import unicodedata
 def _merge_spaced_letter_chains(text: str) -> str:
@@ -60,6 +61,18 @@ def check_blacklist(text: str, blacklist_set: set) -> bool:
 def is_spam(temiz: str, dil: str = "tr") -> bool:
     sadece_harf = re.sub(r'[^a-zğüşıöç]', '', temiz)
     n = len(sadece_harf)
@@ -74,7 +87,8 @@ def is_spam(temiz: str, dil: str = "tr") -> bool:
     if dil == "tr":
         tr_olmayan = set('wqx')
         tr_olmayan_oran = sum(1 for c in sadece_harf if c in tr_olmayan) / max(n, 1)
-        if tr_olmayan_oran > 0.2:
             return True
     unique_chars = len(set(sadece_harf))
@@ -88,6 +102,15 @@ def is_spam(temiz: str, dil: str = "tr") -> bool:
     if re.search(r'(.)\1{6,}', temiz):
         return True
     n_temiz = len(temiz)
     for blok in range(3, min(10, n_temiz // 2 + 1)):
         pattern = temiz[:blok]

 import re
 import unicodedata
+from collections import Counter
 def _merge_spaced_letter_chains(text: str) -> str:
 def is_spam(temiz: str, dil: str = "tr") -> bool:
+    tokens = [t for t in temiz.split() if t]
+    token_count = len(tokens)
+    # Repeated token floods are a common low-effort spam pattern.
+    if token_count >= 4:
+        most_common_count = Counter(tokens).most_common(1)[0][1]
+        if most_common_count / token_count >= 0.6:
+            return True
+    if re.search(r"(https?://|www\.|t\.me/|bit\.ly|discord\.gg)", temiz):
+        return True
     sadece_harf = re.sub(r'[^a-zğüşıöç]', '', temiz)
     n = len(sadece_harf)
     if dil == "tr":
         tr_olmayan = set('wqx')
         tr_olmayan_oran = sum(1 for c in sadece_harf if c in tr_olmayan) / max(n, 1)
+        # Avoid penalizing very short slang-like tokens (e.g., "amq").
+        if n >= 8 and tr_olmayan_oran > 0.2:
             return True
     unique_chars = len(set(sadece_harf))
     if re.search(r'(.)\1{6,}', temiz):
         return True
+    # If one token dominates the full message, it is usually copy-paste spam.
+    if token_count >= 5:
+        normalized_tokens = [re.sub(r'[^a-zğüşıöç0-9]', '', t) for t in tokens]
+        normalized_tokens = [t for t in normalized_tokens if t]
+        if normalized_tokens:
+            top_norm_count = Counter(normalized_tokens).most_common(1)[0][1]
+            if top_norm_count / len(normalized_tokens) >= 0.7:
+                return True
     n_temiz = len(temiz)
     for blok in range(3, min(10, n_temiz // 2 + 1)):
         pattern = temiz[:blok]