Spaces:
Sleeping
Sleeping
| # ============================================================ | |
| # FILE : test_profanity.py | |
| # FUNGSI: Unit test detektor kata kasar | |
| # AUTHOR: Ariel Jonathan | |
| # ============================================================ | |
| from __future__ import annotations | |
| import pytest | |
| from profanity.profanity_detector import ( | |
| ProfanityChecker, | |
| _consonant_skeleton, | |
| _normalize, | |
| ) | |
@pytest.fixture
def checker():
    """Provide a loaded ``ProfanityChecker`` to tests that request ``checker``.

    The function is registered as a pytest fixture so that test methods
    declaring a ``checker`` parameter receive the returned instance; without
    the decorator pytest cannot resolve that parameter and the tests error
    out during setup.

    Returns:
        A ``ProfanityChecker`` created with its default arguments, on which
        ``load()`` has already been called.
    """
    chk = ProfanityChecker()
    chk.load()
    return chk
class _FakePipe:
    """Stand-in pipeline mimicking the output of a toxicity text-classification pipe.

    Every call returns the same single prediction, built from the label and
    score supplied at construction time.
    """

    def __init__(self, label: str, score: float) -> None:
        """Remember the fixed ``label`` and ``score`` reported on every call."""
        self._label = label
        self._score = score

    def __call__(self, text, **kwargs):
        """Return a one-element list holding the canned prediction.

        ``text`` and any keyword arguments are accepted but ignored.
        """
        prediction = {"label": self._label, "score": self._score}
        return [prediction]
class TestLayer2Toksisitas:
    """Optional ML layer 2 — exercised with a fake pipe (no model download)."""

    # Sentence shared by the tests that only vary the fake pipe's answer.
    _KALIMAT = "kamu benar benar menyebalkan sekali hari ini"

    def _chk(self, label, score):
        """Return a loaded checker whose ``_ml_pipe`` always answers ``label``/``score``."""
        patched = ProfanityChecker(use_ml=False)
        patched.load()
        # Inject the fake pipe after loading.
        fake_pipe = _FakePipe(label, score)
        patched._ml_pipe = fake_pipe
        return patched

    def test_teks_toksik_ditandai_layer_ml(self):
        """A "HateSpeech" label scored 0.93 must give a first finding with layer "ml"."""
        findings = self._chk("HateSpeech", 0.93).check(self._KALIMAT)
        assert findings
        assert findings[0].layer == "ml"

    def test_label_negasi_tidak_ditandai(self):
        """A negated label must not be treated as toxic."""
        # "Non_HateSpeech" contains "HATE" but is the negative class -> not toxic.
        patched = self._chk("Non_HateSpeech", 0.99)
        findings = patched.check(self._KALIMAT)
        assert findings == []

    def test_skor_rendah_tidak_ditandai(self):
        """A "HateSpeech" label scored only 0.40 must produce no findings."""
        patched = self._chk("HateSpeech", 0.40)
        findings = patched.check(self._KALIMAT)
        assert findings == []

    def test_leksikon_menang_atas_ml(self):
        """Findings for a sentence containing "anjing" must all come from the lexicon layer."""
        # When Layer 1 finds something, Layer 2 is not run (no "ml" findings).
        patched = self._chk("HateSpeech", 0.99)
        findings = patched.check("dasar anjing kamu ini")
        assert findings
        assert all(item.layer == "lexicon" for item in findings)
| # ============================================================ | |
| # Fungsi normalisasi (fungsi murni) | |
| # ============================================================ | |
class TestNormalize:
    """Checks for the ``_normalize`` helper."""

    def test_leetspeak_dikonversi(self):
        """The leetspeak form "4nj1ng" is converted to "anjing"."""
        hasil = _normalize("4nj1ng")
        assert hasil == "anjing"

    def test_karakter_berulang_dirapikan(self):
        """The repeated characters in "anjiiing" are collapsed to "anjing"."""
        hasil = _normalize("anjiiing")
        assert hasil == "anjing"

    def test_huruf_besar_menjadi_kecil(self):
        """The upper-case input "GOBLOK" comes back as "goblok"."""
        hasil = _normalize("GOBLOK")
        assert hasil == "goblok"
class TestConsonantSkeleton:
    """Checks for the ``_consonant_skeleton`` helper."""

    def test_vokal_dihapus(self):
        """The vowels of "anjing" are removed, leaving "njng"."""
        kerangka = _consonant_skeleton("anjing")
        assert kerangka == "njng"

    def test_kata_tanpa_vokal_tetap(self):
        """A word that has no vowels ("bngst") is returned unchanged."""
        kata = "bngst"
        kerangka = _consonant_skeleton(kata)
        assert kerangka == "bngst"
| # ============================================================ | |
| # Deteksi end-to-end | |
| # ============================================================ | |
class TestDeteksi:
    """End-to-end detection checks run through the ``checker`` fixture."""

    def test_umpatan_langsung_terdeteksi(self, checker):
        """Both plain swear words in the sentence appear among the normalized findings."""
        temuan = checker.check("dasar anjing goblok")
        words = {item.normalized for item in temuan}
        assert {"anjing", "goblok"}.issubset(words)

    def test_kata_vulgar_severity_high(self, checker):
        """The first finding for the vulgar sentence carries severity "HIGH"."""
        findings = checker.check("kontol itu kata vulgar")
        assert findings
        assert findings[0].severity == "HIGH"

    def test_singkatan_tanpa_vokal_terdeteksi(self, checker):
        """Two vowel-less abbreviations yield exactly two findings."""
        findings = checker.check("anjng bngst")
        jumlah = len(findings)
        assert jumlah == 2

    def test_leetspeak_terdeteksi(self, checker):
        """The first finding for leetspeak input is normalized to "anjing"."""
        findings = checker.check("4nj1ng lu")
        assert findings
        assert findings[0].normalized == "anjing"

    def test_teks_bersih_tanpa_temuan(self, checker):
        """A clean sentence returns an empty list of findings."""
        findings = checker.check("Tolong jelaskan materi fotosintesis")
        assert findings == []
class TestRegresiKataMedisReligius:
    """Regression B5: medical words are cleaned from the lexicon SOURCE (not a whitelist).

    Note: after the whitelist was removed, religious/cultural words that are
    still present in the external lexicon (e.g. "setan", "iblis") are flagged
    as profanity again. That is an accepted consequence of removing the
    whitelist.
    """

    def test_payudara_konteks_kesehatan_bersih(self, checker):
        """A health-context sentence mentioning "payudara" yields no findings."""
        findings = checker.check(
            "jelaskan deteksi dini kanker payudara untuk siswa SMA"
        )
        assert findings == []