import re
import logging

logger = logging.getLogger(__name__)


class RegexPIIRemover:
    """Remove PII from free text using a fixed set of regex patterns.

    Patterns are applied one after another in the insertion order of
    ``self.patterns``; each pass operates on the output of the previous one,
    so earlier patterns take precedence over later ones when they overlap
    (e.g. an SSN is redacted before the ZIP pattern can see its digits).
    """

    # Placeholder substituted for every piece of detected PII.
    _PLACEHOLDER = '[REDACTED]'

    # Pattern names whose first capture group is a label ("MRN", "DOB",
    # "Patient", ...) that is kept in the output in front of the placeholder.
    _LABELLED_TYPES = frozenset({'patient_name', 'dob', 'mrn'})

    def __init__(self):
        """Compile the PII detection patterns into ``self.patterns``."""
        self.patterns = {
            # Social Security Numbers
            'ssn': re.compile(r'\b\d{3}-\d{2}-\d{4}\b'),

            # Phone numbers
            'phone': re.compile(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b'),

            # Email addresses. The TLD class is [A-Za-z]; a '|' inside a
            # character class is a literal pipe, not alternation.
            'email': re.compile(
                r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
            ),

            # Dates (MM/DD/YYYY, MM-DD-YYYY, etc.)
            'date': re.compile(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b'),

            # Medical Record Numbers (MRN)
            'mrn': re.compile(
                r'\b(MRN|Medical Record Number)[:\s]+\w+\b', re.IGNORECASE
            ),

            # ZIP codes
            'zip': re.compile(r'\b\d{5}(-\d{4})?\b'),

            # Names (simple pattern - captures "Patient: John Doe" or
            # "Name: Jane Smith"). Only the label is case-insensitive: the
            # name itself must be two capitalised words, otherwise ordinary
            # prose such as "the patient was admitted" would be redacted.
            'patient_name': re.compile(
                r'((?i:Patient|Name))[:\s]+([A-Z][a-z]+\s[A-Z][a-z]+)'
            ),

            # Date of Birth. The generic 'date' pattern runs earlier, so this
            # only sees dates that pattern left untouched.
            'dob': re.compile(
                r'(DOB|Date of Birth)[:\s]+\d{1,2}[/-]\d{1,2}[/-]\d{2,4}',
                re.IGNORECASE,
            ),
        }

        logger.info(
            "RegexPIIRemover initialized with %d patterns", len(self.patterns)
        )

    def remove_pii(self, text: str) -> tuple[str, int]:
        """Redact every PII match in ``text``.

        Args:
            text: Input text.

        Returns:
            tuple: ``(cleaned_text, count_of_pii_removed)`` where the count is
            the total number of substitutions made across all patterns.
        """
        cleaned_text = text
        total_removed = 0

        for pii_type, pattern in self.patterns.items():
            if pii_type in self._LABELLED_TYPES:
                # Keep the captured label and normalise the separator.
                replacement = r'\1: ' + self._PLACEHOLDER
            else:
                replacement = self._PLACEHOLDER

            # subn replaces and counts in a single pass over the text.
            cleaned_text, count = pattern.subn(replacement, cleaned_text)

            if count:
                logger.debug("Found %d instances of %s", count, pii_type)
                total_removed += count

        logger.info("Removed %d PII entities", total_removed)

        return cleaned_text, total_removed