Spaces:
Runtime error
Runtime error
Mustafa Öztürk committed on
Commit ·
27e66da
1
Parent(s): 1bbaeae
Add raw repeat spam check with threshold 6
Browse files
app/services/moderation_service.py
CHANGED
|
@@ -264,7 +264,7 @@ def run_moderation(text: str, platform_dil: str = "tr"):
|
|
| 264 |
temiz = clean_text_nfkc(text)
|
| 265 |
dil = "en" if platform_dil == "en" else "tr"
|
| 266 |
|
| 267 |
-
if is_spam(temiz, dil):
|
| 268 |
early_exit_count += 1
|
| 269 |
_log_pipeline_counts(early_exit_count, detoxify_call_count)
|
| 270 |
return (
|
|
@@ -352,7 +352,7 @@ def run_moderation_batch(texts, platform_dil: str = "tr", batch_size: int = 8):
|
|
| 352 |
for idx, text in enumerate(texts):
|
| 353 |
temiz = clean_text_nfkc(text)
|
| 354 |
|
| 355 |
-
if is_spam(temiz, dil):
|
| 356 |
early_exit_count += 1
|
| 357 |
results[idx] = (
|
| 358 |
"🗑️ SPAM/GİBBERİSH",
|
|
|
|
| 264 |
temiz = clean_text_nfkc(text)
|
| 265 |
dil = "en" if platform_dil == "en" else "tr"
|
| 266 |
|
| 267 |
+
if is_spam(temiz, dil, text):
|
| 268 |
early_exit_count += 1
|
| 269 |
_log_pipeline_counts(early_exit_count, detoxify_call_count)
|
| 270 |
return (
|
|
|
|
| 352 |
for idx, text in enumerate(texts):
|
| 353 |
temiz = clean_text_nfkc(text)
|
| 354 |
|
| 355 |
+
if is_spam(temiz, dil, text):
|
| 356 |
early_exit_count += 1
|
| 357 |
results[idx] = (
|
| 358 |
"🗑️ SPAM/GİBBERİSH",
|
app/utils/text_utils.py
CHANGED
|
@@ -59,7 +59,13 @@ def check_blacklist(text: str, blacklist_set: set) -> bool:
|
|
| 59 |
return bool(set(text.split()) & blacklist_set)
|
| 60 |
|
| 61 |
|
| 62 |
-
def is_spam(temiz: str, dil: str = "tr") -> bool:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
sadece_harf = re.sub(r'[^a-zğüşıöç]', '', temiz)
|
| 64 |
n = len(sadece_harf)
|
| 65 |
|
|
|
|
| 59 |
return bool(set(text.split()) & blacklist_set)
|
| 60 |
|
| 61 |
|
| 62 |
+
def is_spam(temiz: str, dil: str = "tr", ham_metin: str = "") -> bool:
|
| 63 |
+
# Raw-text check protects against normalization hiding exaggerated repeats.
|
| 64 |
+
raw_text = str(ham_metin) if ham_metin else temiz
|
| 65 |
+
raw_tokens = [t for t in raw_text.split() if t]
|
| 66 |
+
if len(raw_tokens) == 1 and re.search(r'(.)\1{5,}', raw_text.lower()):
|
| 67 |
+
return True
|
| 68 |
+
|
| 69 |
sadece_harf = re.sub(r'[^a-zğüşıöç]', '', temiz)
|
| 70 |
n = len(sadece_harf)
|
| 71 |
|