import re
from typing import Dict, List, Optional


class Delexicalizer:
    """Swap profane words for numbered placeholders and restore them later.

    ``delexicalize`` replaces every whole-word, case-insensitive occurrence
    of a listed word with a token of the form ``<PROFANITY_n>`` and records
    the exact matched spelling. ``relexicalize`` puts the recorded spellings
    back. The mapping accumulates across calls until ``reset`` is called.

    Note: input that already contains a literal token identical to an issued
    placeholder is indistinguishable from one and will be rewritten by
    ``relexicalize``.
    """

    # Word list used when the caller does not supply one.
    DEFAULT_PROFANITY = (
        'fuck', 'shit', 'damn', 'ass', 'bitch', 'bastard',
        'crap', 'hell', 'piss', 'dick', 'cock', 'pussy',
    )

    # Exact shape of the tokens produced by _create_placeholder.
    _PLACEHOLDER_RE = re.compile(r"<PROFANITY_[0-9]+>")

    def __init__(self, profanity_words: Optional[List[str]] = None) -> None:
        """
        Create a delexicalizer with empty state.

        Args:
            profanity_words: Literal words to mask. When omitted, the
                built-in DEFAULT_PROFANITY list is used.
        """
        # placeholder -> matched text exactly as it appeared in the input
        self._placeholder_map: Dict[str, str] = {}
        # lower-cased word -> most recent placeholder issued for it
        self._reverse_map: Dict[str, str] = {}
        self._counter = 0

        if profanity_words is None:
            profanity_words = self.DEFAULT_PROFANITY
        # Empty entries are dropped: an empty pattern matches everywhere.
        self._profanity_patterns = [word for word in profanity_words if word]

    def delexicalize(self, text: str) -> str:
        """
        Replace profane words with placeholders.

        Words are processed in list order; within one word, occurrences are
        replaced right-to-left, so placeholder numbers follow that order
        rather than left-to-right position in the text.

        Args:
            text: Input text containing potential profanity

        Returns:
            str: Text with profanity replaced by placeholders
        """
        result = text
        for word in self._profanity_patterns:
            if not word:
                # Guards against an empty entry added to the list later.
                continue
            # re.escape keeps entries literal. The lookarounds behave like
            # \b for words that start and end with a word character, and
            # also work for entries that begin or end with punctuation.
            pattern = r'(?<!\w)' + re.escape(word) + r'(?!\w)'
            matches = list(re.finditer(pattern, result, re.IGNORECASE))
            # Right-to-left so earlier match offsets stay valid while the
            # string is being rewritten.
            for match in reversed(matches):
                placeholder = self._create_placeholder(match.group())
                result = (
                    result[:match.start()] + placeholder + result[match.end():]
                )
        return result

    def relexicalize(self, text: str) -> str:
        """
        Restore original words from placeholders.

        Tokens that look like placeholders but were never issued (or were
        cleared by ``reset``) are left untouched.

        Args:
            text: Text with placeholders

        Returns:
            str: Original text with placeholders replaced
        """
        known = self._placeholder_map
        if not known:
            return text
        # One scan with a dict lookup instead of one str.replace per
        # placeholder ever issued.
        return self._PLACEHOLDER_RE.sub(
            lambda m: known.get(m.group(0), m.group(0)), text
        )

    def _create_placeholder(self, word: str) -> str:
        """Create a unique placeholder for a word and record the mapping."""
        self._counter += 1
        placeholder = f"<PROFANITY_{self._counter}>"
        self._placeholder_map[placeholder] = word
        self._reverse_map[word.lower()] = placeholder
        return placeholder

    def reset(self) -> None:
        """Reset the delexicalizer state (mappings and counter)."""
        self._placeholder_map.clear()
        self._reverse_map.clear()
        self._counter = 0