# ============================================================
# FILE   : test_profanity.py
# PURPOSE: Unit tests for the profanity detector
# AUTHOR : Ariel Jonathan
# ============================================================
from __future__ import annotations

import pytest

from profanity.profanity_detector import (
    ProfanityChecker,
    _consonant_skeleton,
    _normalize,
)


@pytest.fixture(scope="module")
def checker():
    """Provide one loaded ``ProfanityChecker`` shared by the whole module."""
    instance = ProfanityChecker()
    instance.load()
    return instance


class _FakePipe:
    """Stand-in pipeline imitating a toxicity text-classification output."""

    def __init__(self, label: str, score: float) -> None:
        self._label = label
        self._score = score

    def __call__(self, text, **kwargs):
        # The input text and keyword arguments are ignored: the canned
        # label/score pair is returned as a single-item list.
        prediction = {"label": self._label, "score": self._score}
        return [prediction]


class TestLayer2Toksisitas:
    """Optional ML Layer 2 — exercised with a fake pipe (no model download)."""

    # Sentence used by the ML-only tests below.
    _KALIMAT_ML = "kamu benar benar menyebalkan sekali hari ini"

    def _chk(self, label, score):
        """Return a loaded checker whose ``_ml_pipe`` is a ``_FakePipe``.

        The checker is constructed with ``use_ml=False`` and loaded first;
        the fake pipe answering ``label``/``score`` is injected afterwards.
        """
        detector = ProfanityChecker(use_ml=False)
        detector.load()
        detector._ml_pipe = _FakePipe(label, score)  # inject the fake pipe
        return detector

    def test_teks_toksik_ditandai_layer_ml(self):
        detector = self._chk("HateSpeech", 0.93)
        findings = detector.check(self._KALIMAT_ML)
        assert findings
        assert findings[0].layer == "ml"

    def test_label_negasi_tidak_ditandai(self):
        # "Non_HateSpeech" contains "HATE" but is a negative label -> not toxic.
        detector = self._chk("Non_HateSpeech", 0.99)
        findings = detector.check(self._KALIMAT_ML)
        assert findings == []

    def test_skor_rendah_tidak_ditandai(self):
        detector = self._chk("HateSpeech", 0.40)
        findings = detector.check(self._KALIMAT_ML)
        assert findings == []

    def test_leksikon_menang_atas_ml(self):
        # When Layer 1 finds something, Layer 2 is not run (no ml findings).
        detector = self._chk("HateSpeech", 0.99)
        findings = detector.check("dasar anjing kamu ini")
        assert findings
        assert all(item.layer == "lexicon" for item in findings)


# ============================================================
# Normalisation helpers (pure functions)
# ============================================================
class TestNormalize:
    """Checks for ``_normalize`` on leetspeak, repeated letters and casing."""

    def test_leetspeak_dikonversi(self):
        cleaned = _normalize("4nj1ng")
        assert cleaned == "anjing"

    def test_karakter_berulang_dirapikan(self):
        cleaned = _normalize("anjiiing")
        assert cleaned == "anjing"

    def test_huruf_besar_menjadi_kecil(self):
        cleaned = _normalize("GOBLOK")
        assert cleaned == "goblok"


class TestConsonantSkeleton:
    """Checks for ``_consonant_skeleton`` with and without vowels present."""

    def test_vokal_dihapus(self):
        skeleton = _consonant_skeleton("anjing")
        assert skeleton == "njng"

    def test_kata_tanpa_vokal_tetap(self):
        skeleton = _consonant_skeleton("bngst")
        assert skeleton == "bngst"


# ============================================================
# End-to-end detection
# ============================================================
class TestDeteksi:
    """End-to-end checks of ``ProfanityChecker.check`` via the shared fixture."""

    def test_umpatan_langsung_terdeteksi(self, checker):
        results = checker.check("dasar anjing goblok")
        detected = {item.normalized for item in results}
        assert {"anjing", "goblok"}.issubset(detected)

    def test_kata_vulgar_severity_high(self, checker):
        results = checker.check("kontol itu kata vulgar")
        assert results
        assert results[0].severity == "HIGH"

    def test_singkatan_tanpa_vokal_terdeteksi(self, checker):
        results = checker.check("anjng bngst")
        assert len(results) == 2

    def test_leetspeak_terdeteksi(self, checker):
        results = checker.check("4nj1ng lu")
        assert results
        assert results[0].normalized == "anjing"

    def test_teks_bersih_tanpa_temuan(self, checker):
        results = checker.check("Tolong jelaskan materi fotosintesis")
        assert results == []


class TestRegresiKataMedisReligius:
    """Regression B5: medical words were removed from the lexicon SOURCE
    (not handled through a whitelist).

    Note: after the whitelist was removed, religious/cultural words that are
    still present in the external lexicon (e.g. "setan", "iblis") are flagged
    as profanity again. That is an accepted consequence of removing the
    whitelist.
    """

    def test_payudara_konteks_kesehatan_bersih(self, checker):
        sentence = "jelaskan deteksi dini kanker payudara untuk siswa SMA"
        results = checker.check(sentence)
        assert results == []