Spaces:
Sleeping
Sleeping
| """ | |
| Profanity Detector | |
| Simple keyword-based profanity detection using better-profanity library | |
| Supports custom word lists, whitelists, and leetspeak variants | |
| """ | |
| from better_profanity import profanity | |
| import logging | |
| import os | |
| from pathlib import Path | |
| from typing import List, Dict, Optional | |
| logger = logging.getLogger(__name__) | |
class ProfanityDetector:
    """Keyword-based profanity detector with customization support.

    Wraps the ``better_profanity`` library and layers three word sources on
    top of its defaults: an optional Kaggle multi-language dataset, caller
    supplied custom words, and a whitelist of false positives.

    NOTE(review): ``better_profanity.profanity`` is a module-level singleton,
    so the loaded word list is shared by every ProfanityDetector instance —
    the most recently constructed instance wins. Confirm this is acceptable
    for the application before creating multiple detectors.
    """

    def __init__(self, custom_words: Optional[List[str]] = None,
                 whitelist: Optional[List[str]] = None,
                 kaggle_dataset_path: Optional[str] = None,
                 languages: Optional[List[str]] = None):
        """
        Initialize profanity detector with optional custom words.

        Args:
            custom_words: List of additional bad words to detect
            whitelist: List of words to exclude from detection (false positives)
            kaggle_dataset_path: Path to Kaggle dataset folder
            languages: List of language codes to load from Kaggle dataset
        """
        # Load the default word list first; whitelisted words are excluded
        # at load time (better_profanity applies whitelists only here).
        profanity.load_censor_words(whitelist_words=whitelist or [])
        logger.info("Loaded default profanity word list")

        # Load Kaggle dataset if configured.
        kaggle_words: List[str] = []
        if kaggle_dataset_path and languages:
            kaggle_words = self._load_kaggle_dataset(kaggle_dataset_path, languages)
            if kaggle_words:
                profanity.add_censor_words(kaggle_words)
                logger.info("Added %d words from Kaggle dataset (%s)",
                            len(kaggle_words), ', '.join(languages))

        # Add custom words if provided (extends defaults, doesn't replace).
        if custom_words:
            profanity.add_censor_words(custom_words)
            logger.info("Added %d custom bad words", len(custom_words))

        # Local bookkeeping for get_stats() / runtime updates.
        self.whitelist = set(whitelist or [])
        self.custom_words = set(custom_words or [])
        self.kaggle_words = set(kaggle_words)
        self.languages = languages or []

    def is_profane(self, text: str) -> bool:
        """
        Check if text contains profanity.

        Args:
            text: Text to check

        Returns:
            True if profanity detected, False otherwise
        """
        # Empty / whitespace-only input can never be profane.
        if not text or not text.strip():
            return False
        return profanity.contains_profanity(text)

    def detect_violations(self, text: str) -> Optional[Dict]:
        """
        Detect profanity and return detailed violation info.

        Args:
            text: Text to analyze

        Returns:
            Dictionary with violation details if found, None otherwise
        """
        if not text or not text.strip():
            return None
        if not self.is_profane(text):
            return None

        # Censor the text to identify violating words. censor() replaces
        # each bad word in place, so the whitespace-split word counts of
        # the original and censored text line up one-to-one.
        censored = profanity.censor(text, '*')

        # Pair original and censored tokens; a '*' in the censored token
        # marks the original token as a violation. This is a basic
        # heuristic — multi-word phrases are reported token by token.
        original_words = text.split()
        censored_words = censored.split()
        violations = [orig for orig, cens in zip(original_words, censored_words)
                      if '*' in cens]

        return {
            "detected": True,
            "severity": self._calculate_severity(violations),
            "violations": violations,
            "censored_text": censored,
            "violation_count": len(violations),
            "original_text": text
        }

    def _calculate_severity(self, violations: List[str]) -> str:
        """
        Calculate severity based on violation count and word types.

        Args:
            violations: List of violating words

        Returns:
            Severity level: "none", "low", "medium", or "high"
        """
        # Thresholds: 0 -> none, 1 -> low, 2-3 -> medium, 4+ -> high.
        count = len(violations)
        if count == 0:
            return "none"
        elif count == 1:
            return "low"
        elif count <= 3:
            return "medium"
        else:
            return "high"

    def add_words(self, words: List[str]):
        """
        Add words to profanity list at runtime.

        Args:
            words: List of words to add
        """
        profanity.add_censor_words(words)
        self.custom_words.update(words)
        logger.info("Added %d words to profanity list", len(words))

    def add_to_whitelist(self, words: List[str]):
        """
        Add words to whitelist (won't be flagged).

        Args:
            words: List of words to whitelist
        """
        self.whitelist.update(words)
        # BUG FIX: better_profanity only honors the whitelist when the word
        # list is (re)loaded, so rebuild it with the updated whitelist and
        # re-apply the Kaggle/custom extras. Previously this method only
        # updated local state and had no effect on detection.
        profanity.load_censor_words(whitelist_words=list(self.whitelist))
        extra_words = self.kaggle_words | self.custom_words
        if extra_words:
            profanity.add_censor_words(list(extra_words))
        logger.info("Added %d words to whitelist", len(words))

    @staticmethod
    def load_wordlist_from_file(filepath: str) -> List[str]:
        """
        Load custom word list from text file (one word per line).

        Lines that are blank or whose first non-space character is '#'
        are skipped.

        Args:
            filepath: Path to word list file

        Returns:
            List of words (empty on any read error)
        """
        # FIX: declared as @staticmethod — the original took no ``self``,
        # so calling it on an instance would have bound the filepath to self.
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                # FIX: test the *stripped* line so indented comments are
                # also skipped.
                words = [line.strip() for line in f
                         if line.strip() and not line.strip().startswith('#')]
            logger.info("Loaded %d words from %s", len(words), filepath)
            return words
        except FileNotFoundError:
            logger.warning("Word list file not found: %s", filepath)
            return []
        except Exception as e:
            # Best-effort loader: log and fall back to an empty list rather
            # than failing detector construction.
            logger.error("Error loading word list from %s: %s", filepath, e)
            return []

    def _load_kaggle_dataset(self, dataset_path: str, languages: List[str]) -> List[str]:
        """
        Load bad words from Kaggle dataset.

        Args:
            dataset_path: Path to Kaggle dataset folder
            languages: List of language codes to load (e.g., ['en', 'es', 'fr'])
                Use ['all'] to load all available languages

        Returns:
            List of bad words from specified languages
        """
        words: List[str] = []
        dataset_dir = Path(dataset_path)
        if not dataset_dir.exists():
            logger.warning("Kaggle dataset path not found: %s", dataset_path)
            return words

        # Expand 'all' to every file present in the dataset folder.
        # NOTE(review): files are addressed by bare name (no extension) —
        # assumes the dataset ships extensionless language files; confirm.
        if 'all' in languages:
            languages = [f.stem for f in dataset_dir.glob('*') if f.is_file()]
            logger.info("Loading all %d available languages from Kaggle dataset",
                        len(languages))

        # Load words from each language file; a bad file is skipped, not fatal.
        for lang in languages:
            lang_file = dataset_dir / lang
            if not lang_file.exists():
                logger.warning("Language file not found: %s", lang_file)
                continue
            try:
                with open(lang_file, 'r', encoding='utf-8') as f:
                    # FIX: test the stripped line so indented comments are
                    # skipped too.
                    lang_words = [line.strip() for line in f
                                  if line.strip() and not line.strip().startswith('#')]
                words.extend(lang_words)
                logger.info("Loaded %d words from language: %s", len(lang_words), lang)
            except Exception as e:
                logger.error("Error loading language file %s: %s", lang, e)

        return words

    def get_stats(self) -> Dict:
        """Get detector statistics.

        Returns:
            Dict with counts of custom/whitelist/Kaggle words, the configured
            languages, and whether only the library defaults are in use.
        """
        return {
            "custom_words_count": len(self.custom_words),
            "whitelist_count": len(self.whitelist),
            "kaggle_words_count": len(self.kaggle_words),
            "languages": self.languages,
            "using_defaults": len(self.custom_words) == 0 and len(self.kaggle_words) == 0
        }