prof-demo / src /core /delexicalizer.py
sbicy's picture
Upload 17 files
deff797 verified
from typing import Dict, List, Optional
import re
class Delexicalizer:
    """Swap profane words for numbered placeholders and restore them later.

    ``delexicalize`` replaces every whole-word, case-insensitive match of an
    entry in ``_profanity_patterns`` with a token of the form
    ``<PROFANITY_n>`` and remembers the exact original spelling.
    ``relexicalize`` turns those tokens back into the remembered words.

    The placeholder table and counter are kept on the instance and grow
    across calls until ``reset`` is invoked, so placeholders issued by
    earlier ``delexicalize`` calls stay restorable.
    """

    # Single source of truth for the placeholder format, shared by the code
    # that issues placeholders and the code that recognises them.
    _PLACEHOLDER_TEMPLATE = "<PROFANITY_{}>"
    _PLACEHOLDER_RE = re.compile(r"<PROFANITY_[0-9]+>")

    def __init__(self):
        # placeholder -> word exactly as it appeared in the input
        self._placeholder_map: Dict[str, str] = {}
        # lower-cased word -> most recent placeholder issued for that word
        self._reverse_map: Dict[str, str] = {}
        # number of the last placeholder handed out (0 = none yet)
        self._counter = 0

        # Basic profanity list for detection
        self._profanity_patterns = [
            'fuck', 'shit', 'damn', 'ass', 'bitch', 'bastard',
            'crap', 'hell', 'piss', 'dick', 'cock', 'pussy',
        ]

    def delexicalize(self, text: str) -> str:
        """
        Replace profane words with placeholders.

        The text is scanned once, left to right, so placeholders are
        numbered in reading order and text that has already been replaced
        is never scanned again.

        Args:
            text: Input text containing potential profanity

        Returns:
            str: Text with profanity replaced by placeholders
        """
        if not self._profanity_patterns:
            # An empty alternation would match at every word boundary.
            return text

        # Built per call (the ``re`` module caches compiled patterns) so that
        # edits made to the pattern list after construction are honoured.
        # Entries are used as regex fragments, as before; each one is wrapped
        # in its own group so the alternation cannot split an entry.
        alternation = "|".join(f"(?:{p})" for p in self._profanity_patterns)
        matcher = re.compile(rf"\b(?:{alternation})\b", re.IGNORECASE)

        def _swap(match):
            # Never issue a placeholder that already occurs literally in the
            # input: relexicalize() could not tell the two apart and would
            # overwrite the caller's own text.
            while self._PLACEHOLDER_TEMPLATE.format(self._counter + 1) in text:
                self._counter += 1
            return self._create_placeholder(match.group())

        return matcher.sub(_swap, text)

    def relexicalize(self, text: str) -> str:
        """
        Restore original words from placeholders.

        Placeholder-shaped tokens that this instance did not issue (or that
        were forgotten by ``reset``) are left untouched.

        Args:
            text: Text with placeholders

        Returns:
            str: Original text with placeholders replaced
        """
        known = self._placeholder_map
        # One pass over the text; restored words are not scanned again.
        return self._PLACEHOLDER_RE.sub(
            lambda m: known.get(m.group(), m.group()), text
        )

    def _create_placeholder(self, word: str) -> str:
        """Create a unique placeholder for a word and record the mapping."""
        self._counter += 1
        placeholder = self._PLACEHOLDER_TEMPLATE.format(self._counter)
        self._placeholder_map[placeholder] = word
        self._reverse_map[word.lower()] = placeholder
        return placeholder

    def reset(self):
        """Reset the delexicalizer state.

        Clears both lookup tables and restarts numbering, so placeholders
        issued before the reset can no longer be restored.
        """
        self._placeholder_map.clear()
        self._reverse_map.clear()
        self._counter = 0