# Source: OFPBadWord / src/profanity_detector.py
# (commit 589ab6a, "added dataset")
"""
Profanity Detector
Simple keyword-based profanity detection using better-profanity library
Supports custom word lists, whitelists, and leetspeak variants
"""
from better_profanity import profanity
import logging
import os
from pathlib import Path
from typing import List, Dict, Optional
logger = logging.getLogger(__name__)
class ProfanityDetector:
    """Keyword-based profanity detector with customization support.

    Wraps the module-level ``better_profanity`` singleton, so loaded word
    lists are shared process-wide; creating a second instance re-initializes
    the shared list.
    """

    def __init__(self, custom_words: Optional[List[str]] = None,
                 whitelist: Optional[List[str]] = None,
                 kaggle_dataset_path: Optional[str] = None,
                 languages: Optional[List[str]] = None):
        """
        Initialize profanity detector with optional custom words.

        Args:
            custom_words: List of additional bad words to detect
            whitelist: List of words to exclude from detection (false positives)
            kaggle_dataset_path: Path to Kaggle dataset folder
            languages: List of language codes to load from Kaggle dataset
        """
        # Load default word list first. NOTE: better-profanity only honors
        # the whitelist at load time; add_to_whitelist() reloads accordingly.
        profanity.load_censor_words(whitelist_words=whitelist or [])
        logger.info("Loaded default profanity word list")

        # Load Kaggle dataset if configured
        kaggle_words: List[str] = []
        if kaggle_dataset_path and languages:
            kaggle_words = self._load_kaggle_dataset(kaggle_dataset_path, languages)
            if kaggle_words:
                profanity.add_censor_words(kaggle_words)
                logger.info("Added %d words from Kaggle dataset (%s)",
                            len(kaggle_words), ', '.join(languages))

        # Add custom words if provided (extends defaults, doesn't replace)
        if custom_words:
            profanity.add_censor_words(custom_words)
            logger.info("Added %d custom bad words", len(custom_words))

        # Bookkeeping copies so the library state can be rebuilt at runtime
        # (see add_to_whitelist) and reported via get_stats().
        self.whitelist = set(whitelist or [])
        self.custom_words = set(custom_words or [])
        self.kaggle_words = set(kaggle_words)
        self.languages = languages or []

    def is_profane(self, text: str) -> bool:
        """
        Check if text contains profanity.

        Args:
            text: Text to check

        Returns:
            True if profanity detected, False otherwise
        """
        # Empty/whitespace-only input is never profane.
        if not text or not text.strip():
            return False
        return profanity.contains_profanity(text)

    def detect_violations(self, text: str) -> Optional[Dict]:
        """
        Detect profanity and return detailed violation info.

        Args:
            text: Text to analyze

        Returns:
            Dictionary with violation details if found, None otherwise
        """
        if not text or not text.strip():
            return None
        if not self.is_profane(text):
            return None

        # Censor the text to identify violating words
        censored = profanity.censor(text, '*')

        # Basic word-by-word comparison: a censored token contains '*'.
        # zip() truncates if censoring ever changes the token count, which
        # is an accepted limitation of this simple implementation.
        violations = [
            orig for orig, cens in zip(text.split(), censored.split())
            if '*' in cens
        ]

        return {
            "detected": True,
            "severity": self._calculate_severity(violations),
            "violations": violations,
            "censored_text": censored,
            "violation_count": len(violations),
            "original_text": text
        }

    def _calculate_severity(self, violations: List[str]) -> str:
        """
        Calculate severity based on violation count.

        Args:
            violations: List of violating words

        Returns:
            Severity level: "none", "low", "medium", or "high"
        """
        count = len(violations)
        if count == 0:
            return "none"
        elif count == 1:
            return "low"
        elif count <= 3:
            return "medium"
        else:
            return "high"

    def add_words(self, words: List[str]):
        """
        Add words to profanity list at runtime.

        Args:
            words: List of words to add
        """
        profanity.add_censor_words(words)
        self.custom_words.update(words)
        logger.info("Added %d words to profanity list", len(words))

    def add_to_whitelist(self, words: List[str]):
        """
        Add words to whitelist (won't be flagged).

        Reloads the underlying word list so the new whitelist entries take
        effect immediately, then re-adds the Kaggle/custom words so they are
        not lost by the reload.

        Args:
            words: List of words to whitelist
        """
        self.whitelist.update(words)
        # Bug fix: better-profanity only applies the whitelist inside
        # load_censor_words(); updating self.whitelist alone had no effect
        # on detection. Rebuild the shared word list with the full whitelist.
        profanity.load_censor_words(whitelist_words=list(self.whitelist))
        extra_words = self.kaggle_words | self.custom_words
        if extra_words:
            profanity.add_censor_words(list(extra_words))
        logger.info("Added %d words to whitelist", len(words))

    @staticmethod
    def load_wordlist_from_file(filepath: str) -> List[str]:
        """
        Load custom word list from text file (one word per line).

        Blank lines and '#'-prefixed comment lines are skipped.

        Args:
            filepath: Path to word list file

        Returns:
            List of words; empty list if the file is missing or unreadable
        """
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                # Check the stripped line for '#' so indented comments are
                # also skipped (previously only column-0 comments were).
                words = [
                    stripped for line in f
                    if (stripped := line.strip()) and not stripped.startswith('#')
                ]
            logger.info("Loaded %d words from %s", len(words), filepath)
            return words
        except FileNotFoundError:
            logger.warning(f"Word list file not found: {filepath}")
            return []
        except Exception as e:
            logger.error(f"Error loading word list from {filepath}: {e}")
            return []

    def _load_kaggle_dataset(self, dataset_path: str, languages: List[str]) -> List[str]:
        """
        Load bad words from Kaggle dataset.

        Args:
            dataset_path: Path to Kaggle dataset folder
            languages: List of language codes to load (e.g., ['en', 'es', 'fr'])
                       Use ['all'] to load all available languages

        Returns:
            List of bad words from specified languages
        """
        words: List[str] = []
        dataset_dir = Path(dataset_path)

        if not dataset_dir.exists():
            logger.warning(f"Kaggle dataset path not found: {dataset_path}")
            return words

        # Get all available language files if 'all' is specified
        if 'all' in languages:
            languages = [f.stem for f in dataset_dir.glob('*') if f.is_file()]
            logger.info("Loading all %d available languages from Kaggle dataset",
                        len(languages))

        # Load words from each language file
        for lang in languages:
            lang_file = dataset_dir / lang
            if not lang_file.exists():
                logger.warning(f"Language file not found: {lang_file}")
                continue
            try:
                with open(lang_file, 'r', encoding='utf-8') as f:
                    # Skip blank lines and comments (checked on the stripped
                    # line, consistent with load_wordlist_from_file).
                    lang_words = [
                        stripped for line in f
                        if (stripped := line.strip()) and not stripped.startswith('#')
                    ]
                words.extend(lang_words)
                logger.info("Loaded %d words from language: %s", len(lang_words), lang)
            except Exception as e:
                logger.error(f"Error loading language file {lang}: {e}")

        return words

    def get_stats(self) -> Dict:
        """Get detector statistics as a plain dict (counts + config)."""
        return {
            "custom_words_count": len(self.custom_words),
            "whitelist_count": len(self.whitelist),
            "kaggle_words_count": len(self.kaggle_words),
            "languages": self.languages,
            "using_defaults": len(self.custom_words) == 0 and len(self.kaggle_words) == 0
        }