Mustafa Öztürk committed on
Commit
27e66da
·
1 Parent(s): 1bbaeae

Add raw repeat spam check with threshold 6

Browse files
app/services/moderation_service.py CHANGED
@@ -264,7 +264,7 @@ def run_moderation(text: str, platform_dil: str = "tr"):
264
  temiz = clean_text_nfkc(text)
265
  dil = "en" if platform_dil == "en" else "tr"
266
 
267
- if is_spam(temiz, dil):
268
  early_exit_count += 1
269
  _log_pipeline_counts(early_exit_count, detoxify_call_count)
270
  return (
@@ -352,7 +352,7 @@ def run_moderation_batch(texts, platform_dil: str = "tr", batch_size: int = 8):
352
  for idx, text in enumerate(texts):
353
  temiz = clean_text_nfkc(text)
354
 
355
- if is_spam(temiz, dil):
356
  early_exit_count += 1
357
  results[idx] = (
358
  "🗑️ SPAM/GİBBERİSH",
 
264
  temiz = clean_text_nfkc(text)
265
  dil = "en" if platform_dil == "en" else "tr"
266
 
267
+ if is_spam(temiz, dil, text):
268
  early_exit_count += 1
269
  _log_pipeline_counts(early_exit_count, detoxify_call_count)
270
  return (
 
352
  for idx, text in enumerate(texts):
353
  temiz = clean_text_nfkc(text)
354
 
355
+ if is_spam(temiz, dil, text):
356
  early_exit_count += 1
357
  results[idx] = (
358
  "🗑️ SPAM/GİBBERİSH",
app/utils/text_utils.py CHANGED
@@ -59,7 +59,13 @@ def check_blacklist(text: str, blacklist_set: set) -> bool:
59
  return bool(set(text.split()) & blacklist_set)
60
 
61
 
62
- def is_spam(temiz: str, dil: str = "tr") -> bool:
 
 
 
 
 
 
63
  sadece_harf = re.sub(r'[^a-zğüşıöç]', '', temiz)
64
  n = len(sadece_harf)
65
 
 
59
  return bool(set(text.split()) & blacklist_set)
60
 
61
 
62
+ def is_spam(temiz: str, dil: str = "tr", ham_metin: str = "") -> bool:
63
+ # Raw-text check protects against normalization hiding exaggerated repeats.
64
+ raw_text = str(ham_metin) if ham_metin else temiz
65
+ raw_tokens = [t for t in raw_text.split() if t]
66
+ if len(raw_tokens) == 1 and re.search(r'(.)\1{5,}', raw_text.lower()):
67
+ return True
68
+
69
  sadece_harf = re.sub(r'[^a-zğüşıöç]', '', temiz)
70
  n = len(sadece_harf)
71