# Prompt-Builder/tests/test_profanity.py
# Commit 5ddfd1f (ArielJoe):
#   "feat: cross-detector language policy + tidy structure & file naming"
# ============================================================
# FILE   : test_profanity.py
# PURPOSE: Unit tests for the profanity detector
# AUTHOR : Ariel Jonathan
# ============================================================
from __future__ import annotations
import pytest
from profanity.profanity_detector import (
ProfanityChecker,
_consonant_skeleton,
_normalize,
)
@pytest.fixture(scope="module")
def checker():
    """Provide one loaded ``ProfanityChecker`` shared by the tests of this module."""
    detector = ProfanityChecker()
    detector.load()
    return detector
class _FakePipe:
"""Pipeline tiruan meniru keluaran text-classification toksisitas."""
def __init__(self, label: str, score: float) -> None:
self._label, self._score = label, score
def __call__(self, text, **kwargs):
return [{"label": self._label, "score": self._score}]
class TestLayer2Toksisitas:
    """Optional ML layer 2, exercised through a fake pipe (no model download)."""

    # Input sentence shared by the ML-layer tests below.
    _KALIMAT = "kamu benar benar menyebalkan sekali hari ini"

    def _chk(self, label, score):
        """Build a loaded checker whose ``_ml_pipe`` is a ``_FakePipe(label, score)``."""
        fake_pipe = _FakePipe(label, score)
        detector = ProfanityChecker(use_ml=False)
        detector.load()
        detector._ml_pipe = fake_pipe  # inject the fake pipe
        return detector

    def test_teks_toksik_ditandai_layer_ml(self):
        """A toxic label with a 0.93 score yields a first finding from the "ml" layer."""
        findings = self._chk("HateSpeech", 0.93).check(self._KALIMAT)
        assert findings
        assert findings[0].layer == "ml"

    def test_label_negasi_tidak_ditandai(self):
        """A negated label must not be treated as toxic."""
        # "Non_HateSpeech" contains "HATE" but is a negative label → not toxic.
        detector = self._chk("Non_HateSpeech", 0.99)
        assert detector.check(self._KALIMAT) == []

    def test_skor_rendah_tidak_ditandai(self):
        """A toxic label with a score of 0.40 must yield no findings."""
        detector = self._chk("HateSpeech", 0.40)
        assert detector.check(self._KALIMAT) == []

    def test_leksikon_menang_atas_ml(self):
        """Lexicon findings take precedence over the ML layer."""
        # When layer 1 finds something, layer 2 is not run (no "ml" findings).
        findings = self._chk("HateSpeech", 0.99).check("dasar anjing kamu ini")
        assert findings
        layers = [item.layer for item in findings]
        assert all(layer == "lexicon" for layer in layers)
# ============================================================
# Normalization functions (pure functions)
# ============================================================
class TestNormalize:
    """Checks for the ``_normalize`` helper."""

    def test_leetspeak_dikonversi(self):
        """Leetspeak input "4nj1ng" normalizes to "anjing"."""
        normalized = _normalize("4nj1ng")
        assert normalized == "anjing"

    def test_karakter_berulang_dirapikan(self):
        """Repeated characters in "anjiiing" are collapsed to "anjing"."""
        normalized = _normalize("anjiiing")
        assert normalized == "anjing"

    def test_huruf_besar_menjadi_kecil(self):
        """Upper-case input "GOBLOK" is lower-cased."""
        normalized = _normalize("GOBLOK")
        assert normalized == "goblok"
class TestConsonantSkeleton:
    """Checks for the ``_consonant_skeleton`` helper."""

    def test_vokal_dihapus(self):
        """Vowels are stripped: "anjing" becomes "njng"."""
        skeleton = _consonant_skeleton("anjing")
        assert skeleton == "njng"

    def test_kata_tanpa_vokal_tetap(self):
        """A word without vowels ("bngst") is returned unchanged."""
        skeleton = _consonant_skeleton("bngst")
        assert skeleton == "bngst"
# ============================================================
# End-to-end detection
# ============================================================
class TestDeteksi:
    """End-to-end checks of ``checker.check`` using the shared ``checker`` fixture."""

    def test_umpatan_langsung_terdeteksi(self, checker):
        """Plain swear words are both reported in their normalized form."""
        findings = checker.check("dasar anjing goblok")
        detected = {item.normalized for item in findings}
        assert {"anjing", "goblok"}.issubset(detected)

    def test_kata_vulgar_severity_high(self, checker):
        """The first finding for the vulgar sentence carries severity "HIGH"."""
        findings = checker.check("kontol itu kata vulgar")
        assert findings
        assert findings[0].severity == "HIGH"

    def test_singkatan_tanpa_vokal_terdeteksi(self, checker):
        """Both vowel-less abbreviations are reported (exactly two findings)."""
        assert len(checker.check("anjng bngst")) == 2

    def test_leetspeak_terdeteksi(self, checker):
        """A leetspeak spelling is reported with normalized form "anjing"."""
        findings = checker.check("4nj1ng lu")
        assert findings
        assert findings[0].normalized == "anjing"

    def test_teks_bersih_tanpa_temuan(self, checker):
        """A clean sentence produces an empty list of findings."""
        findings = checker.check("Tolong jelaskan materi fotosintesis")
        assert findings == []
class TestRegresiKataMedisReligius:
    """Regression B5: medical words are cleaned from the lexicon SOURCE (not a whitelist).

    Note: since the whitelist was removed, religious/cultural words that are
    still present in the external lexicon (e.g. "setan", "iblis") are flagged
    as profanity again. That is an accepted consequence of removing the
    whitelist.
    """

    def test_payudara_konteks_kesehatan_bersih(self, checker):
        """A health-education sentence mentioning "payudara" yields no findings."""
        findings = checker.check(
            "jelaskan deteksi dini kanker payudara untuk siswa SMA"
        )
        assert findings == []