Spaces:
Sleeping
Sleeping
| """ | |
| Profanity Checker — Deteksi kata kasar dan tidak pantas Bahasa Indonesia. | |
| Mendeteksi dua tingkat keparahan: | |
| HIGH Kata vulgar/seksual eksplisit yang hampir selalu tidak pantas | |
| dalam konteks apapun (termasuk pendidikan). | |
| Contoh: kontol, memek, entot | |
| MEDIUM Umpatan, makian, atau kata merendahkan yang kontekstual namun | |
| umumnya tidak pantas dalam teks formal atau pendidikan. | |
| Contoh: anjing, bangsat, goblok | |
| Strategi deteksi (berlapis untuk menangkap variasi penulisan): | |
| 1. Normalisasi: collapse karakter berulang (anjiiing → anjing), | |
| konversi leet-speak (4→a, 3→e, 1→i, 0→o, $→s) | |
| 2. Lookup langsung di lexicon gabungan dari resources/lexicons/profanity/ | |
| 3. Strip digit suffix: babi2 → babi (pola reduplikasi) | |
| 4. Skeleton konsonan: hapus vokal, cocokkan struktur konsonan | |
| → anjng (skeleton: njng) → cocok anjing (skeleton: njng) | |
| → bngst (skeleton: bngst) → cocok bangsat (skeleton: bngst) | |
| Sumber lexicon curated/import: | |
| okkyibrohim et al. — id-multi-label-hate-speech (abusive.csv) | |
| LDNOOBW — multilingual profanity list | |
| Update online dilakukan lewat scripts/import_lexicons.py, bukan saat startup. | |
| Referensi: | |
| okkyibrohim et al. (2019). Multi-label hate speech & abusive language | |
| detection on Indonesian Twitter. EMNLP-IJCNLP 2019. | |
| """ | |
| from __future__ import annotations | |
| import logging | |
| import re | |
| from dataclasses import dataclass | |
| from typing import Literal | |
| from core.lexicons import load_word_set | |
| logger = logging.getLogger(__name__) | |
# Layer 2 — optional pre-trained toxicity classifier
#
# A model fine-tuned for Indonesian (e.g. IndoBERTweet hate-speech) serves as a
# second net for abusive/toxic speech that slips past the lexicon. Layer 1
# (the lexicon) remains the primary detector. To swap the model, change the
# constant below or set PROFANITY_ML_MODEL in the config module.
import threading
from pathlib import Path as _Path

try:
    from core import config as _config
    # Both settings are optional attributes of the config module; the literals
    # are the defaults used when an attribute is absent.
    _ML_MODEL = getattr(_config, "PROFANITY_ML_MODEL", "Exqrch/IndoBERTweet-HateSpeech")
    _ML_THRESHOLD = getattr(_config, "PROFANITY_ML_THRESHOLD", 0.85)
except Exception:  # pragma: no cover
    # Config module unavailable or broken: fall back to the same defaults.
    _ML_MODEL, _ML_THRESHOLD = "Exqrch/IndoBERTweet-HateSpeech", 0.85

# Local directory checked before falling back to the model id (see
# ProfanityChecker.load) and written by ProfanityChecker.download_ml_model.
_ML_DIR = _Path(__file__).parent.parent.parent / "cache" / "profanity-toxicity-id"

# Classifier output labels treated as toxic (robust across model naming schemes).
_ML_TOXIC_LABELS = {"HATE", "HATESPEECH", "HATE_SPEECH", "TOXIC", "OFFENSIVE", "ABUSIVE", "LABEL_1"}
# Substrings that also mark an (upper-cased) label as toxic.
_ML_TOXIC_SUBSTR = ("HATE", "TOXIC", "ABUS", "OFFENS")

# wordfreq is optional; _WORDFREQ_OK records whether word_frequency is usable.
try:
    from wordfreq import word_frequency
    _WORDFREQ_OK = True
except ImportError:
    _WORDFREQ_OK = False
# Constants & configuration

# Leet-speak → plain-letter conversion table.
# Only characters that plausibly appear inside a word (no mathematical symbols).
_LEET: dict[str, str] = {
    "4": "a", "3": "e", "1": "i", "0": "o",
    "@": "a", "$": "s", "5": "s", "7": "t",
    "9": "g", "8": "b",
}
_LEET_TABLE = str.maketrans(_LEET)
# Frequency at or above which a token is treated as a known common word.
_WORDFREQ_MIN = 1e-8

# Built-in words (fallback & severity classification)
#
# Used when the lexicon files are unavailable, and to decide the severity of
# words that come from the external lexicon.

# HIGH — explicit vulgar/sexual words (inappropriate in any context)
_BUILTIN_HIGH: set[str] = load_word_set("profanity", "high_fallback.txt") or set()
# MEDIUM — swearing, insults, demeaning words (inappropriate in formal/educational text)
_BUILTIN_MEDIUM: set[str] = load_word_set("profanity", "medium_fallback.txt") or set()
# Data types
SeverityLevel = Literal["HIGH", "MEDIUM"]


@dataclass
class ProfanityFinding:
    """A single profanity hit found in the checked text.

    The class is instantiated with keyword arguments throughout this module,
    so it must be a dataclass: a plain class with bare annotations has no
    generated ``__init__`` and every construction would raise ``TypeError``.
    """

    word: str                # original text as it appeared in the input
    normalized: str          # normalized form (the one that matched the lexicon)
    start: int               # start character offset
    end: int                 # end character offset
    severity: SeverityLevel  # "HIGH" (vulgar) or "MEDIUM" (swearing)
    reason: str              # short explanation shown to the user
    confidence: float        # confidence score, 0.0–1.0
    layer: str = "lexicon"   # "lexicon" (Layer 1) | "ml" (Layer 2 toxicity model)
| # Loader & Normalisasi | |
def _load_lexicon() -> tuple[set[str], set[str]]:
    """
    Load the profanity lexicon and return ``(high_words, medium_words)``.

    Priority order:
      1. Read the local files under the "profanity" lexicon directory
         (``high.txt`` / ``medium.txt``).
      2. If a file yields nothing, use the curated fallback set of this module.
      3. Merge the English lists (``high_en.txt`` / ``medium_en.txt``).
      4. Add every ``external_id.txt`` word to MEDIUM unless it is in HIGH.

    Every loader result is copied into a fresh set and a falsy result is
    treated as empty. NOTE(review): ``load_word_set`` is defined elsewhere;
    this guards against it returning ``None`` for a missing file or handing
    out a shared/cached set, which the in-place ``|=`` below would otherwise
    mutate — confirm against the loader.

    Returns:
        (high_words, medium_words) — two sets of profane words.
    """
    def _read(filename: str) -> set[str]:
        # Fresh copy; a falsy loader result (None / empty) becomes an empty set.
        return set(load_word_set("profanity", filename) or ())

    high = _read("high.txt") or set(_BUILTIN_HIGH)
    medium = _read("medium.txt") or set(_BUILTIN_MEDIUM)

    # English profanity (cross-language). Tokens are matched whole, so these
    # entries do not collide with substrings of other words.
    high |= _read("high_en.txt")
    medium |= _read("medium_en.txt")

    # Words not classified as HIGH go to MEDIUM.
    medium |= _read("external_id.txt") - high

    logger.info("Lexicon dimuat: HIGH=%d kata, MEDIUM=%d kata.", len(high), len(medium))
    return high, medium
def _normalize(text: str) -> str:
    """
    Normalize text before lexicon matching.

    Steps, in order:
      1. Lower-case the text.
      2. Map leet-speak characters to letters via the module's leet table
         (4→a, 3→e, 1→i, 0→o, @→a, $→s, 5→s, 7→t, 9→g, 8→b).
      3. Collapse any character repeated three or more times in a row into a
         single occurrence: "anjiiing" → "anjing".
    """
    lowered = text.lower()
    plain = lowered.translate(_LEET_TABLE)
    # Runs of exactly two identical characters are deliberately left alone.
    return re.sub(r"(.)\1{2,}", r"\1", plain)
def _all_variants(raw: str) -> list[str]:
    """
    Return every normalized variant of ``raw`` that should be looked up.

    Variants produced:
      - the basic normalization (leet conversion + repeat collapsing);
      - that form with one trailing digit removed, e.g. "babi2" → "babi"
        (informal reduplication spelling), when it ends in a digit.
    """
    base = _normalize(raw)
    if base and base[-1].isdigit():
        return [base, base[:-1]]
    return [base]
# Matches a word spelled out letter by letter: at least three single ASCII
# letters separated by runs of whitespace, dots, hyphens or underscores
# (e.g. "a n j i n g", "a.n.j.i.n.g").
_SPACED_LETTERS = re.compile(r"\b(?:[a-zA-Z][\s.\-_]+){2,}[a-zA-Z]\b")
def _consonant_skeleton(word: str) -> str:
    """
    Return ``word`` with every lower-case vowel (a, e, i, o, u) removed.

    The result is used to match vowel-less abbreviations:
    "anjng" → "njng", which equals the skeleton of "anjing".
    """
    vowels = re.compile(r"[aeiou]")
    return vowels.sub("", word)
def _is_known_common_word(word: str, language: str = "id") -> bool:
    """
    Return True when ``word`` is a known common word, so that consonant-skeleton
    matching should not be applied to it.

    The word is looked up in the wordfreq list of ``language`` and, when that
    is not Indonesian, in the Indonesian list as well. Previously the
    ``language`` argument was ignored and only Indonesian was consulted, so
    ordinary English words received no protection in English text.

    Args:
        word: Normalized token to look up.
        language: Language code of the text being checked (default "id").

    Returns:
        False when wordfreq is not installed; otherwise whether the frequency
        in any consulted language reaches ``_WORDFREQ_MIN``.
    """
    if not _WORDFREQ_OK:
        return False

    requested = language or "id"
    languages = ("id",) if requested == "id" else (requested, "id")
    for lang in languages:
        try:
            if word_frequency(word, lang) >= _WORDFREQ_MIN:
                return True
        except LookupError:
            # wordfreq has no word list for this language code; try the next one.
            continue
    return False
def _build_skeleton_index(words: set[str]) -> dict[str, list[str]]:
    """
    Build an index from consonant skeleton to the profane words sharing it.

    Only words of at least 4 characters whose skeleton has at least 3
    characters are indexed, which limits false positives on short, common
    abbreviations that merely share a skeleton.

    Several words can share one skeleton (e.g. "bangsat" and "bangset" both
    give "bngst"), so each value is a LIST; entries never overwrite each other.

    Returns:
        dict {skeleton: [original_word, ...]}
    """
    index: dict[str, list[str]] = {}
    for word in words:
        if len(word) < 4:
            continue
        skeleton = _consonant_skeleton(word)
        if len(skeleton) < 3:
            continue
        index.setdefault(skeleton, []).append(word)
    return index
def _match_skeleton(candidates: list[str] | None, norm_len: int) -> str | None:
    """
    Pick the profane word that a consonant skeleton should resolve to.

    The longest candidate is chosen. Ties between equally long candidates are
    broken alphabetically, so the result does not depend on the order of
    ``candidates`` (which originates from set iteration and may differ
    between runs).

    Args:
        candidates: Words sharing the skeleton, or None / empty when the
            skeleton is not indexed.
        norm_len: Length of the normalized token being checked.

    Returns:
        The chosen word when the token is not longer than it (this avoids
        false positives on long tokens); otherwise None.
    """
    if not candidates:
        return None
    # Longest first; alphabetical order decides among equal lengths.
    best = min(candidates, key=lambda w: (-len(w), w))
    return best if norm_len <= len(best) else None
| # Kelas Utama | |
class ProfanityChecker:
    """
    Lexicon-based profanity detector for Indonesian text.

    Recognises several spelling variations: the plain form, leet-speak,
    repeated characters, a trailing digit, letter-by-letter spelling and
    vowel-less abbreviations. An optional pre-trained toxicity classifier
    (Layer 2) is consulted only when the lexicon finds nothing.

    Example::

        chk = ProfanityChecker()
        chk.load()
        for f in chk.check("anjing lu goblok"):
            print(f.word, f.severity, f.normalized)
    """

    def __init__(self, use_ml: bool = True) -> None:
        """Create an unloaded checker; ``use_ml`` enables Layer 2 on load()."""
        self._high: set[str] = set()
        self._medium: set[str] = set()
        self._skel_high: dict[str, list[str]] = {}    # skeleton → HIGH words
        self._skel_medium: dict[str, list[str]] = {}  # skeleton → MEDIUM words
        self._use_ml = use_ml
        self._ml_pipe = None
        self._ml_lock = threading.Lock()  # held around every classifier call
        self._loaded = False

    # Public API

    def load(self) -> bool:
        """
        Load the lexicon (Layer 1) and, when enabled, the toxicity model (Layer 2).

        A failure to load the model is logged and leaves only the lexicon
        active. Calling load() again after a successful load is a no-op.

        Returns:
            True once the lexicon has been loaded.
        """
        if self._loaded:
            return True

        self._high, self._medium = _load_lexicon()
        self._skel_high = _build_skeleton_index(self._high)
        self._skel_medium = _build_skeleton_index(self._medium)

        # Optional Layer 2: pre-trained toxicity classifier.
        if self._use_ml:
            # Prefer the local cache directory when it exists, else the model id.
            load_from = str(_ML_DIR) if _ML_DIR.is_dir() else _ML_MODEL
            try:
                from transformers import pipeline as _hf_pipeline  # lazy import
                logger.info("Memuat model toksisitas '%s'...", load_from)
                self._ml_pipe = _hf_pipeline(
                    "text-classification", model=load_from,
                    truncation=True, max_length=256,
                )
                logger.info("Model toksisitas berhasil dimuat.")
            except Exception as exc:
                logger.warning("Gagal memuat model toksisitas: %s — hanya leksikon aktif.", exc)
                self._ml_pipe = None

        self._loaded = True
        return True

    def ml_active(self) -> bool:
        """Return True when Layer 2 (the toxicity model) is active."""
        return self._ml_pipe is not None

    def download_ml_model(self) -> bool:
        """
        Download the toxicity model into the local cache directory for offline use.

        Returns:
            True on success, False when the download (or its import) failed;
            the failure is logged.
        """
        try:
            from huggingface_hub import snapshot_download
            _ML_DIR.mkdir(parents=True, exist_ok=True)
            snapshot_download(repo_id=_ML_MODEL, local_dir=str(_ML_DIR))
            return True
        except Exception as exc:
            logger.error("Gagal mengunduh model toksisitas: %s", exc)
            return False

    def _check_ml(self, text: str) -> list[ProfanityFinding]:
        """
        Classify the toxicity of the whole text (advice-only).

        Returns:
            A single MEDIUM finding with ``layer="ml"`` when the classifier
            reports a toxic label with a score of at least ``_ML_THRESHOLD``;
            otherwise an empty list (also when the model is not loaded or the
            classifier call fails).
        """
        if self._ml_pipe is None:
            return []
        try:
            with self._ml_lock:
                result = self._ml_pipe(text, truncation=True, max_length=256)
        except Exception as exc:
            logger.warning("Klasifikasi toksisitas ML gagal: %s", exc)
            return []

        if isinstance(result, list):
            result = result[0] if result else {}
        label = str(result.get("label", "")).upper()
        score = float(result.get("score", 0.0))

        # Respect negative/safe labels even when they contain "HATE"
        # (e.g. "Non_HateSpeech", "NOT_OFFENSIVE", "NORMAL", "CLEAN").
        is_safe = any(n in label for n in ("NON", "NOT", "NEG", "NORMAL", "CLEAN", "SAFE", "BUKAN"))
        toxic = (not is_safe) and (
            label in _ML_TOXIC_LABELS or any(s in label for s in _ML_TOXIC_SUBSTR)
        )
        if not toxic or score < _ML_THRESHOLD:
            return []

        return [ProfanityFinding(
            # The finding covers the text as a whole; only a 120-char preview is kept.
            word=text[:120] + ("..." if len(text) > 120 else ""),
            normalized="",
            start=0, end=min(len(text), 120),
            severity="MEDIUM",
            reason=("Teks ini terindikasi mengandung ujaran kasar atau toksik "
                    f"(model ML, skor {score:.0%}). Tinjau dan perhalus bila perlu."),
            confidence=round(score, 3),
            layer="ml",
        )]

    def _spell_correct(self, norm: str) -> str | None:
        """
        Return the spelling correction of ``norm`` from the word-quality detector.

        Best-effort: returns None when the detector cannot be imported, is not
        loaded, offers no correction, or raises. Failures are logged at debug
        level instead of being silently discarded.
        """
        try:
            from word_quality.word_quality_detector import get_detector
            wq = get_detector(load=False)
            if not wq.is_loaded:
                return None
            return wq.correct_spelling(norm, language="id") or None
        except Exception as exc:
            logger.debug("Koreksi ejaan gagal untuk %r: %s", norm, exc)
            return None

    def _classify_token(
        self,
        raw: str,
        language: str,
    ) -> tuple[SeverityLevel | None, str]:
        """
        Classify one token (or joined spelled-out word) as profanity or not.

        Stages, in order:
          1.   Direct lookup of every normalized variant.
          2.   Consonant-skeleton match (vowel-less abbreviations), skipped
               for known common words.
          2.5. A skeleton match is dropped when spelling correction turns the
               token into a clearly common Indonesian word.
          3.   Without a skeleton match, the spelling correction itself is
               looked up in the lexicon.

        Returns:
            ``(severity, matched_word)``; ``(None, "")`` when the token is clean.
        """
        # A purely numeric token is a number, not an obfuscated word. Without
        # this guard the leet table decodes e.g. "8481" into "babi".
        if raw.isdigit():
            return None, ""

        # Stage 1: check every normalized variant.
        for variant in _all_variants(raw):
            if len(variant) < 2:
                continue
            if variant in self._high:
                return "HIGH", variant
            if variant in self._medium:
                return "MEDIUM", variant

        # Stage 2: consonant skeleton (abbreviations / dropped vowels).
        norm = _normalize(raw)
        if _is_known_common_word(norm, language):
            return None, ""

        severity: SeverityLevel | None = None
        matched_as = ""
        skel = _consonant_skeleton(norm)
        if skel and len(skel) >= 3:
            hi = _match_skeleton(self._skel_high.get(skel), len(norm))
            if hi is not None:
                severity, matched_as = "HIGH", hi
            else:
                me = _match_skeleton(self._skel_medium.get(skel), len(norm))
                if me is not None:
                    severity, matched_as = "MEDIUM", me

        if severity is not None:
            # Stage 2.5: validate the skeleton match. Example false positive:
            # "speda" (informal "sepeda") has skeleton "spd", which may hit the
            # external lexicon although the correction "sepeda" is a common word.
            if _WORDFREQ_OK:
                corrected = self._spell_correct(norm)
                if corrected and corrected != norm:
                    try:
                        id_freq = word_frequency(corrected, "id")
                    except Exception as exc:
                        logger.debug("Pemeriksaan frekuensi kata gagal untuk %r: %s", corrected, exc)
                        id_freq = 0.0
                    # 500 × _WORDFREQ_MIN = 5e-6 — must clearly be a common word.
                    if id_freq >= _WORDFREQ_MIN * 500:
                        return None, ""
            return severity, matched_as

        # Stage 3: is the spelling correction itself a profane word?
        corrected = self._spell_correct(norm)
        if corrected:
            if corrected in self._high:
                return "HIGH", corrected
            if corrected in self._medium:
                return "MEDIUM", corrected
        return None, ""

    @staticmethod
    def _lexicon_finding(
        word: str,
        normalized: str,
        start: int,
        end: int,
        severity: SeverityLevel,
        *,
        spelled_out: bool,
    ) -> ProfanityFinding:
        """Build a Layer-1 finding; spelled-out matches get a lower confidence."""
        if severity == "HIGH":
            reason = "Kata vulgar/seksual eksplisit yang tidak pantas dalam konteks pendidikan."
            confidence = 0.90 if spelled_out else 0.95
        else:
            reason = "Umpatan atau kata merendahkan yang tidak pantas dalam teks formal/pendidikan."
            confidence = 0.80 if spelled_out else 0.85
        return ProfanityFinding(
            word=word,
            normalized=normalized,
            start=start,
            end=end,
            severity=severity,
            reason=reason,
            confidence=confidence,
        )

    def check(self, text: str, language: str = "id") -> list[ProfanityFinding]:
        """
        Check ``text`` and return every profanity finding.

        A pre-pass looks for words spelled letter by letter ("a n j i n g");
        afterwards every token not covered by such a span is classified. When
        the lexicon finds nothing, the model is loaded and the text has at
        least three whitespace-separated words, the toxicity classifier is
        consulted.

        Args:
            text: Text to check. Empty, blank or None yields no findings.
            language: Language code of the text, used for the common-word check.

        Returns:
            ProfanityFinding list sorted by position (``start`` ascending).
        """
        if not text or not text.strip():
            return []
        if not self._loaded:
            self.load()

        findings: list[ProfanityFinding] = []
        # Spans already reported by the spelled-out pre-pass. Ordinary tokens
        # never overlap each other, so only these need an overlap check.
        spelled_spans: list[tuple[int, int]] = []

        # Pre-pass: profanity spelled letter by letter, e.g. "a n j i n g".
        for m in _SPACED_LETTERS.finditer(text):
            raw = m.group()
            joined = re.sub(r"[^a-zA-Z0-9@$]", "", raw)
            severity, matched_as = self._classify_token(joined, language)
            if severity is None:
                continue
            findings.append(self._lexicon_finding(
                raw, matched_as or _normalize(joined), m.start(), m.end(),
                severity, spelled_out=True,
            ))
            spelled_spans.append(m.span())

        for m in re.finditer(r"[a-zA-Z0-9@$]+", text):
            raw = m.group()
            start, end = m.span()
            if any(s <= start < e or s < end <= e for s, e in spelled_spans):
                continue
            severity, matched_as = self._classify_token(raw, language)
            if severity is None:
                continue
            findings.append(self._lexicon_finding(
                raw, matched_as or _normalize(raw), start, end,
                severity, spelled_out=False,
            ))

        # Layer 2: toxicity classifier (optional). Runs only when the lexicon
        # found nothing, so it catches what the lexicon misses without adding
        # findings to text that Layer 1 has already flagged.
        if not findings and self._ml_pipe is not None and len(text.split()) >= 3:
            findings.extend(self._check_ml(text))

        return sorted(findings, key=lambda f: f.start)

    # Properties

    @property
    def is_loaded(self) -> bool:
        """True once the lexicon has been loaded."""
        return self._loaded

    @property
    def lexicon_size(self) -> int:
        """Total number of distinct words in the lexicon (HIGH ∪ MEDIUM)."""
        return len(self._high | self._medium)
| # Singleton | |
_checker: ProfanityChecker | None = None


def get_checker() -> ProfanityChecker:
    """
    Return the shared ProfanityChecker instance, creating it on first use.

    The same instance is reused by every caller, so the lexicon is loaded
    only once per process.
    """
    global _checker
    if _checker is not None:
        return _checker
    _checker = ProfanityChecker()
    _checker.load()
    return _checker
| # Demo CLI | |
if __name__ == "__main__":
    import sys

    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")

    # Built-in sample sentences, used when no text is given on the command line.
    SAMPLES = [
        "Tolong jelaskan materi fotosintesis",
        "anjing lu goblok banget",
        "bangsat! laporan ini berantakan",
        "kontol memek itu kata vulgar",
        "saya tidak suka cara kerjanya",
        "bego banget si brengsek itu",
        "Pak Andi adalah guru yang baik",
        "4nj1ng lu anjng bangsat bngst",  # leet + abbreviations
    ]

    cli_texts = sys.argv[1:]
    texts = cli_texts if cli_texts else SAMPLES

    chk = ProfanityChecker()
    chk.load()
    print(f"Lexicon: {chk.lexicon_size} kata\n{'-' * 60}")

    for text in texts:
        findings = chk.check(text)
        print(f"\n> {text}")
        if not findings:
            print(" (bersih)")
        for f in findings:
            # Ten-cell confidence bar: '#' for each full tenth, '.' for the rest.
            filled = int(f.confidence * 10)
            bar = "#" * filled + "." * (10 - filled)
            print(
                f" [{f.severity:<6}] {f.word!r:<15} "
                f"(norm={f.normalized!r}) {bar} {f.confidence:.0%}"
            )