""" Profanity Detector Simple keyword-based profanity detection using better-profanity library Supports custom word lists, whitelists, and leetspeak variants """ from better_profanity import profanity import logging import os from pathlib import Path from typing import List, Dict, Optional logger = logging.getLogger(__name__) class ProfanityDetector: """Keyword-based profanity detector with customization support""" def __init__(self, custom_words: Optional[List[str]] = None, whitelist: Optional[List[str]] = None, kaggle_dataset_path: Optional[str] = None, languages: Optional[List[str]] = None): """ Initialize profanity detector with optional custom words Args: custom_words: List of additional bad words to detect whitelist: List of words to exclude from detection (false positives) kaggle_dataset_path: Path to Kaggle dataset folder languages: List of language codes to load from Kaggle dataset """ # Load default word list first profanity.load_censor_words(whitelist_words=whitelist or []) logger.info("Loaded default profanity word list") # Load Kaggle dataset if configured kaggle_words = [] if kaggle_dataset_path and languages: kaggle_words = self._load_kaggle_dataset(kaggle_dataset_path, languages) if kaggle_words: profanity.add_censor_words(kaggle_words) logger.info(f"Added {len(kaggle_words)} words from Kaggle dataset ({', '.join(languages)})") # Add custom words if provided (extends defaults, doesn't replace) if custom_words: profanity.add_censor_words(custom_words) logger.info(f"Added {len(custom_words)} custom bad words") self.whitelist = set(whitelist or []) self.custom_words = set(custom_words or []) self.kaggle_words = set(kaggle_words) self.languages = languages or [] def is_profane(self, text: str) -> bool: """ Check if text contains profanity Args: text: Text to check Returns: True if profanity detected, False otherwise """ if not text or not text.strip(): return False return profanity.contains_profanity(text) def detect_violations(self, text: str) -> Optional[Dict]: """ Detect profanity and return detailed violation info Args: text: Text to analyze Returns: Dictionary with violation details if found, None otherwise """ if not text or not text.strip(): return None if not self.is_profane(text): return None # Censor the text to identify violating words censored = profanity.censor(text, '*') # Extract censored words (basic implementation) original_words = text.split() censored_words = censored.split() violations = [] for orig, cens in zip(original_words, censored_words): if '*' in cens: violations.append(orig) return { "detected": True, "severity": self._calculate_severity(violations), "violations": violations, "censored_text": censored, "violation_count": len(violations), "original_text": text } def _calculate_severity(self, violations: List[str]) -> str: """ Calculate severity based on violation count and word types Args: violations: List of violating words Returns: Severity level: "none", "low", "medium", or "high" """ count = len(violations) if count == 0: return "none" elif count == 1: return "low" elif count <= 3: return "medium" else: return "high" def add_words(self, words: List[str]): """ Add words to profanity list at runtime Args: words: List of words to add """ profanity.add_censor_words(words) self.custom_words.update(words) logger.info(f"Added {len(words)} words to profanity list") def add_to_whitelist(self, words: List[str]): """ Add words to whitelist (won't be flagged) Args: words: List of words to whitelist """ self.whitelist.update(words) logger.info(f"Added {len(words)} words to whitelist") @staticmethod def load_wordlist_from_file(filepath: str) -> List[str]: """ Load custom word list from text file (one word per line) Args: filepath: Path to word list file Returns: List of words """ try: with open(filepath, 'r', encoding='utf-8') as f: words = [line.strip() for line in f if line.strip() and not line.startswith('#')] logger.info(f"Loaded {len(words)} words from {filepath}") return words except FileNotFoundError: logger.warning(f"Word list file not found: {filepath}") return [] except Exception as e: logger.error(f"Error loading word list from {filepath}: {e}") return [] def _load_kaggle_dataset(self, dataset_path: str, languages: List[str]) -> List[str]: """ Load bad words from Kaggle dataset Args: dataset_path: Path to Kaggle dataset folder languages: List of language codes to load (e.g., ['en', 'es', 'fr']) Use ['all'] to load all available languages Returns: List of bad words from specified languages """ words = [] dataset_dir = Path(dataset_path) if not dataset_dir.exists(): logger.warning(f"Kaggle dataset path not found: {dataset_path}") return words # Get all available language files if 'all' is specified if 'all' in languages: available_files = [f.stem for f in dataset_dir.glob('*') if f.is_file()] languages = available_files logger.info(f"Loading all {len(languages)} available languages from Kaggle dataset") # Load words from each language file for lang in languages: lang_file = dataset_dir / lang if not lang_file.exists(): logger.warning(f"Language file not found: {lang_file}") continue try: with open(lang_file, 'r', encoding='utf-8') as f: lang_words = [ line.strip() for line in f if line.strip() and not line.startswith('#') ] words.extend(lang_words) logger.info(f"Loaded {len(lang_words)} words from language: {lang}") except Exception as e: logger.error(f"Error loading language file {lang}: {e}") return words def get_stats(self) -> Dict: """Get detector statistics""" return { "custom_words_count": len(self.custom_words), "whitelist_count": len(self.whitelist), "kaggle_words_count": len(self.kaggle_words), "languages": self.languages, "using_defaults": len(self.custom_words) == 0 and len(self.kaggle_words) == 0 }