Mustafa Öztürk committed on
Commit
9628b21
·
1 Parent(s): 78509a9

Improve spam heuristics and reduce short-token false positives

Browse files
Files changed (1) hide show
  1. app/utils/text_utils.py +24 -1
app/utils/text_utils.py CHANGED
@@ -1,5 +1,6 @@
1
  import re
2
  import unicodedata
 
3
 
4
 
5
  def _merge_spaced_letter_chains(text: str) -> str:
@@ -60,6 +61,18 @@ def check_blacklist(text: str, blacklist_set: set) -> bool:
60
 
61
 
62
  def is_spam(temiz: str, dil: str = "tr") -> bool:
 
 
 
 
 
 
 
 
 
 
 
 
63
  sadece_harf = re.sub(r'[^a-zğüşıöç]', '', temiz)
64
  n = len(sadece_harf)
65
 
@@ -74,7 +87,8 @@ def is_spam(temiz: str, dil: str = "tr") -> bool:
74
  if dil == "tr":
75
  tr_olmayan = set('wqx')
76
  tr_olmayan_oran = sum(1 for c in sadece_harf if c in tr_olmayan) / max(n, 1)
77
- if tr_olmayan_oran > 0.2:
 
78
  return True
79
 
80
  unique_chars = len(set(sadece_harf))
@@ -88,6 +102,15 @@ def is_spam(temiz: str, dil: str = "tr") -> bool:
88
  if re.search(r'(.)\1{6,}', temiz):
89
  return True
90
 
 
 
 
 
 
 
 
 
 
91
  n_temiz = len(temiz)
92
  for blok in range(3, min(10, n_temiz // 2 + 1)):
93
  pattern = temiz[:blok]
 
1
  import re
2
  import unicodedata
3
+ from collections import Counter
4
 
5
 
6
  def _merge_spaced_letter_chains(text: str) -> str:
 
61
 
62
 
63
  def is_spam(temiz: str, dil: str = "tr") -> bool:
64
+ tokens = [t for t in temiz.split() if t]
65
+ token_count = len(tokens)
66
+
67
+ # Repeated token floods are a common low-effort spam pattern.
68
+ if token_count >= 4:
69
+ most_common_count = Counter(tokens).most_common(1)[0][1]
70
+ if most_common_count / token_count >= 0.6:
71
+ return True
72
+
73
+ if re.search(r"(https?://|www\.|t\.me/|bit\.ly|discord\.gg)", temiz):
74
+ return True
75
+
76
  sadece_harf = re.sub(r'[^a-zğüşıöç]', '', temiz)
77
  n = len(sadece_harf)
78
 
 
87
  if dil == "tr":
88
  tr_olmayan = set('wqx')
89
  tr_olmayan_oran = sum(1 for c in sadece_harf if c in tr_olmayan) / max(n, 1)
90
+ # Avoid penalizing very short slang-like tokens (e.g., "amq").
91
+ if n >= 8 and tr_olmayan_oran > 0.2:
92
  return True
93
 
94
  unique_chars = len(set(sadece_harf))
 
102
  if re.search(r'(.)\1{6,}', temiz):
103
  return True
104
 
105
+ # If one token dominates the full message, it is usually copy-paste spam.
106
+ if token_count >= 5:
107
+ normalized_tokens = [re.sub(r'[^a-zğüşıöç0-9]', '', t) for t in tokens]
108
+ normalized_tokens = [t for t in normalized_tokens if t]
109
+ if normalized_tokens:
110
+ top_norm_count = Counter(normalized_tokens).most_common(1)[0][1]
111
+ if top_norm_count / len(normalized_tokens) >= 0.7:
112
+ return True
113
+
114
  n_temiz = len(temiz)
115
  for blok in range(3, min(10, n_temiz // 2 + 1)):
116
  pattern = temiz[:blok]