from typing import Dict, List, Optional
import re


class Delexicalizer:
    """Mask profane words with reversible placeholders.

    ``delexicalize`` swaps every profane word for a unique token of the form
    ``<PROFANITY_n>`` and records the exact original spelling, so that
    ``relexicalize`` can later restore the text verbatim (including the
    original letter case of each occurrence).

    State accumulates across ``delexicalize`` calls until ``reset`` is called.
    """

    # Format of the tokens written into the text, and the regex recognising
    # them.  The closing ``>`` keeps ``<PROFANITY_1>`` from being mistaken for
    # a prefix of ``<PROFANITY_10>``.
    _PLACEHOLDER_TEMPLATE = "<PROFANITY_{}>"
    _PLACEHOLDER_RE = re.compile(r"<PROFANITY_\d+>")

    def __init__(self) -> None:
        # placeholder token -> original word exactly as it appeared
        self._placeholder_map: Dict[str, str] = {}
        # lower-cased word -> most recent placeholder issued for it
        self._reverse_map: Dict[str, str] = {}
        # number of placeholders issued since construction / last reset
        self._counter = 0

        # Basic profanity list for detection.  Entries are used as regex
        # fragments and matched as whole words, case-insensitively.
        self._profanity_patterns = [
            'fuck', 'shit', 'damn', 'ass', 'bitch', 'bastard',
            'crap', 'hell', 'piss', 'dick', 'cock', 'pussy'
        ]

    def delexicalize(self, text: str) -> str:
        """
        Replace profane words with placeholders.

        Matching is case-insensitive and restricted to whole words, so
        "hello" or "classic" are left alone.  Each occurrence receives its
        own placeholder, numbered left to right.

        Args:
            text: Input text containing potential profanity

        Returns:
            str: Text with profanity replaced by placeholders
        """
        if not self._profanity_patterns:
            return text

        # One alternation handles all words in a single left-to-right pass.
        # It is built from the current list on each call so that changes to
        # the list are honoured; ``re`` caches the compiled pattern.
        alternation = "|".join(
            f"(?:{pattern})" for pattern in self._profanity_patterns
        )
        matcher = re.compile(rf"\b(?:{alternation})\b", re.IGNORECASE)
        return matcher.sub(
            lambda match: self._create_placeholder(match.group()), text
        )

    def relexicalize(self, text: str) -> str:
        """
        Restore original words from placeholders.

        Tokens that look like placeholders but were not issued by this
        instance (or were cleared by ``reset``) are left untouched.

        Args:
            text: Text with placeholders

        Returns:
            str: Original text with placeholders replaced
        """
        if not self._placeholder_map:
            return text

        # A single regex pass means restored words are never re-scanned.
        return self._PLACEHOLDER_RE.sub(
            lambda match: self._placeholder_map.get(match.group(), match.group()),
            text,
        )

    def _create_placeholder(self, word: str) -> str:
        """Create a unique placeholder for a word and record the mapping."""
        self._counter += 1
        placeholder = self._PLACEHOLDER_TEMPLATE.format(self._counter)
        self._placeholder_map[placeholder] = word
        self._reverse_map[word.lower()] = placeholder
        return placeholder

    def reset(self) -> None:
        """Reset the delexicalizer state."""
        self._placeholder_map.clear()
        self._reverse_map.clear()
        self._counter = 0