Spaces:

moztrk
/

sentinel-api

Runtime error

Mustafa Öztürk committed on Mar 10

Commit

27e66da

1 Parent(s): 1bbaeae

Add raw repeat spam check with threshold 6

Files changed (2) hide show

app/services/moderation_service.py CHANGED Viewed

@@ -264,7 +264,7 @@ def run_moderation(text: str, platform_dil: str = "tr"):
     temiz = clean_text_nfkc(text)
     dil = "en" if platform_dil == "en" else "tr"
-    if is_spam(temiz, dil):
         early_exit_count += 1
         _log_pipeline_counts(early_exit_count, detoxify_call_count)
         return (
@@ -352,7 +352,7 @@ def run_moderation_batch(texts, platform_dil: str = "tr", batch_size: int = 8):
     for idx, text in enumerate(texts):
         temiz = clean_text_nfkc(text)
-        if is_spam(temiz, dil):
             early_exit_count += 1
             results[idx] = (
                 "🗑️ SPAM/GİBBERİSH",

     temiz = clean_text_nfkc(text)
     dil = "en" if platform_dil == "en" else "tr"
+    if is_spam(temiz, dil, text):
         early_exit_count += 1
         _log_pipeline_counts(early_exit_count, detoxify_call_count)
         return (
     for idx, text in enumerate(texts):
         temiz = clean_text_nfkc(text)
+        if is_spam(temiz, dil, text):
             early_exit_count += 1
             results[idx] = (
                 "🗑️ SPAM/GİBBERİSH",

app/utils/text_utils.py CHANGED Viewed

@@ -59,7 +59,13 @@ def check_blacklist(text: str, blacklist_set: set) -> bool:
     return bool(set(text.split()) & blacklist_set)
-def is_spam(temiz: str, dil: str = "tr") -> bool:
     sadece_harf = re.sub(r'[^a-zğüşıöç]', '', temiz)
     n = len(sadece_harf)

     return bool(set(text.split()) & blacklist_set)
+def is_spam(temiz: str, dil: str = "tr", ham_metin: str = "") -> bool:
+    # Raw-text check protects against normalization hiding exaggerated repeats.
+    raw_text = str(ham_metin) if ham_metin else temiz
+    raw_tokens = [t for t in raw_text.split() if t]
+    if len(raw_tokens) == 1 and re.search(r'(.)\1{5,}', raw_text.lower()):
+        return True
     sadece_harf = re.sub(r'[^a-zğüşıöç]', '', temiz)
     n = len(sadece_harf)