# NOTE: this file was recovered from a web-page scrape; hosting-UI artifacts
# (build status text, file size, commit hashes, gutter line numbers) that
# preceded the module docstring were reduced to this comment.
"""
Profanity Detector
Simple keyword-based profanity detection using better-profanity library
Supports custom word lists, whitelists, and leetspeak variants
"""
from better_profanity import profanity
import logging
import os
from pathlib import Path
from typing import List, Dict, Optional
logger = logging.getLogger(__name__)
class ProfanityDetector:
    """Keyword-based profanity detector with customization support.

    Wraps the ``better_profanity`` module-level singleton, so the word list
    is shared process-wide: constructing a second instance (or mutating the
    list at runtime) affects every other instance.  # NOTE(review): inherent
    to the library's design; confirm acceptable for multi-tenant use.
    """

    def __init__(self, custom_words: Optional[List[str]] = None,
                 whitelist: Optional[List[str]] = None,
                 kaggle_dataset_path: Optional[str] = None,
                 languages: Optional[List[str]] = None):
        """
        Initialize profanity detector with optional custom words.

        Args:
            custom_words: List of additional bad words to detect
            whitelist: List of words to exclude from detection (false positives)
            kaggle_dataset_path: Path to Kaggle dataset folder
            languages: List of language codes to load from Kaggle dataset
        """
        # Load default word list first; whitelist is applied at load time.
        profanity.load_censor_words(whitelist_words=whitelist or [])
        logger.info("Loaded default profanity word list")

        # Load Kaggle dataset if configured (both path and languages required).
        kaggle_words = []
        if kaggle_dataset_path and languages:
            kaggle_words = self._load_kaggle_dataset(kaggle_dataset_path, languages)
            if kaggle_words:
                profanity.add_censor_words(kaggle_words)
                logger.info(f"Added {len(kaggle_words)} words from Kaggle dataset ({', '.join(languages)})")

        # Add custom words if provided (extends defaults, doesn't replace).
        if custom_words:
            profanity.add_censor_words(custom_words)
            logger.info(f"Added {len(custom_words)} custom bad words")

        # Kept so the library list can be rebuilt later (see add_to_whitelist)
        # and for get_stats() reporting.
        self.whitelist = set(whitelist or [])
        self.custom_words = set(custom_words or [])
        self.kaggle_words = set(kaggle_words)
        self.languages = languages or []

    def is_profane(self, text: str) -> bool:
        """
        Check if text contains profanity.

        Args:
            text: Text to check

        Returns:
            True if profanity detected, False otherwise
        """
        # Empty / whitespace-only input is never profane.
        if not text or not text.strip():
            return False
        return profanity.contains_profanity(text)

    def detect_violations(self, text: str) -> Optional[Dict]:
        """
        Detect profanity and return detailed violation info.

        Args:
            text: Text to analyze

        Returns:
            Dictionary with violation details if found, None otherwise
        """
        if not text or not text.strip():
            return None
        if not self.is_profane(text):
            return None

        # Censor the text to identify violating words.
        censored = profanity.censor(text, '*')

        # Extract censored words by aligning original and censored tokens.
        # Assumes censoring preserves whitespace token boundaries, which
        # holds for whole-word replacement — TODO confirm for multi-word
        # entries in custom lists.
        original_words = text.split()
        censored_words = censored.split()
        violations = [
            orig
            for orig, cens in zip(original_words, censored_words)
            if '*' in cens
        ]

        return {
            "detected": True,
            "severity": self._calculate_severity(violations),
            "violations": violations,
            "censored_text": censored,
            "violation_count": len(violations),
            "original_text": text
        }

    def _calculate_severity(self, violations: List[str]) -> str:
        """
        Calculate severity based on violation count.

        Args:
            violations: List of violating words

        Returns:
            Severity level: "none", "low", "medium", or "high"
        """
        count = len(violations)
        if count == 0:
            return "none"
        elif count == 1:
            return "low"
        elif count <= 3:
            return "medium"
        else:
            return "high"

    def add_words(self, words: List[str]):
        """
        Add words to profanity list at runtime.

        Args:
            words: List of words to add
        """
        profanity.add_censor_words(words)
        self.custom_words.update(words)
        logger.info(f"Added {len(words)} words to profanity list")

    def add_to_whitelist(self, words: List[str]):
        """
        Add words to whitelist (won't be flagged).

        BUG FIX: previously this only updated the ``self.whitelist``
        bookkeeping set; the library's censor list was never reloaded, so
        newly whitelisted words kept being flagged.  The whitelist is only
        honored by ``load_censor_words``, so we rebuild the list with the
        updated whitelist and then re-apply the Kaggle/custom additions.

        Args:
            words: List of words to whitelist
        """
        self.whitelist.update(words)
        # Rebuild the library word list so the new whitelist takes effect.
        profanity.load_censor_words(whitelist_words=list(self.whitelist))
        if self.kaggle_words:
            profanity.add_censor_words(list(self.kaggle_words))
        if self.custom_words:
            profanity.add_censor_words(list(self.custom_words))
        logger.info(f"Added {len(words)} words to whitelist")

    @staticmethod
    def load_wordlist_from_file(filepath: str) -> List[str]:
        """
        Load custom word list from text file (one word per line).

        Lines that are blank or begin with '#' (at column 0) are skipped.

        Args:
            filepath: Path to word list file

        Returns:
            List of words; empty list if the file is missing or unreadable.
        """
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                words = [line.strip() for line in f if line.strip() and not line.startswith('#')]
            logger.info(f"Loaded {len(words)} words from {filepath}")
            return words
        except FileNotFoundError:
            # Missing file is a soft failure: warn and fall back to defaults.
            logger.warning(f"Word list file not found: {filepath}")
            return []
        except Exception as e:
            logger.error(f"Error loading word list from {filepath}: {e}")
            return []

    def _load_kaggle_dataset(self, dataset_path: str, languages: List[str]) -> List[str]:
        """
        Load bad words from Kaggle dataset.

        Args:
            dataset_path: Path to Kaggle dataset folder
            languages: List of language codes to load (e.g., ['en', 'es', 'fr'])
                Use ['all'] to load all available languages

        Returns:
            List of bad words from specified languages
        """
        words = []
        dataset_dir = Path(dataset_path)
        if not dataset_dir.exists():
            # Soft failure: the detector still works with the default list.
            logger.warning(f"Kaggle dataset path not found: {dataset_path}")
            return words

        # Expand 'all' to every file present in the dataset directory
        # (file stem is treated as the language code).
        if 'all' in languages:
            available_files = [f.stem for f in dataset_dir.glob('*') if f.is_file()]
            languages = available_files
            logger.info(f"Loading all {len(languages)} available languages from Kaggle dataset")

        # Load words from each language file; skip blanks and '#' comments.
        for lang in languages:
            lang_file = dataset_dir / lang
            if not lang_file.exists():
                logger.warning(f"Language file not found: {lang_file}")
                continue
            try:
                with open(lang_file, 'r', encoding='utf-8') as f:
                    lang_words = [
                        line.strip()
                        for line in f
                        if line.strip() and not line.startswith('#')
                    ]
                words.extend(lang_words)
                logger.info(f"Loaded {len(lang_words)} words from language: {lang}")
            except Exception as e:
                # One bad language file should not abort the others.
                logger.error(f"Error loading language file {lang}: {e}")
        return words

    def get_stats(self) -> Dict:
        """Get detector statistics."""
        return {
            "custom_words_count": len(self.custom_words),
            "whitelist_count": len(self.whitelist),
            "kaggle_words_count": len(self.kaggle_words),
            "languages": self.languages,
            "using_defaults": len(self.custom_words) == 0 and len(self.kaggle_words) == 0
        }