|
|
import re |
|
|
import logging |
|
|
|
|
|
# Module-level logger named after this module, so its records can be filtered
# and configured per module through the standard logging hierarchy.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
class RegexPIIRemover:
    """Remove PII from free text using a fixed set of compiled regex patterns.

    The patterns are applied one after another, in the insertion order of
    ``self.patterns``, and each pass works on the output of the previous
    one. Order therefore matters: for example ``'date'`` runs before
    ``'dob'``, so a labelled birth date has usually already been redacted
    (and counted once) by the time the ``'dob'`` pattern is tried.
    """

    # PII types whose pattern captures a leading label in group 1. For these
    # the label is kept (normalized to "<label>: ") and only the value that
    # follows it is replaced.
    _LABELLED_TYPES = frozenset({'patient_name', 'dob', 'mrn'})

    def __init__(self):
        """Compile the PII patterns and store them on ``self.patterns``."""
        self.patterns = {
            # US Social Security number written as 3-2-4 digit groups.
            'ssn': re.compile(r'\b\d{3}-\d{2}-\d{4}\b'),
            # 10-digit phone number with optional '-' or '.' separators.
            'phone': re.compile(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b'),
            # E-mail address. The TLD class is letters only: a '|' inside the
            # brackets would be a literal pipe (not alternation) and would let
            # a match run across pipe-delimited fields.
            'email': re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'),
            # Numeric date with '/' or '-' separators, e.g. 1/2/20 or 01-02-2020.
            'date': re.compile(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b'),
            # "MRN" / "Medical Record Number" label followed by an identifier.
            'mrn': re.compile(r'\b(MRN|Medical Record Number)[:\s]+\w+\b', re.IGNORECASE),
            # Any standalone 5-digit number, optionally with a -NNNN suffix.
            'zip': re.compile(r'\b\d{5}(-\d{4})?\b'),
            # "Patient" / "Name" label followed by two words.
            # NOTE(review): IGNORECASE also applies to the [A-Z][a-z]+ parts,
            # so any two alphabetic words after the label match, not only
            # capitalized names. Confirm this over-redaction is intended.
            'patient_name': re.compile(r'(Patient|Name)[:\s]+([A-Z][a-z]+\s[A-Z][a-z]+)', re.IGNORECASE),
            # "DOB" / "Date of Birth" label followed by a numeric date.
            'dob': re.compile(r'(DOB|Date of Birth)[:\s]+\d{1,2}[/-]\d{1,2}[/-]\d{2,4}', re.IGNORECASE),
        }

        logger.info("RegexPIIRemover initialized with %d patterns", len(self.patterns))

    def remove_pii(self, text: str) -> tuple[str, int]:
        """Redact every pattern match in ``text``.

        Unlabelled matches are replaced by ``[REDACTED]``; matches of the
        labelled types (patient name, DOB, MRN) are replaced by
        ``<label>: [REDACTED]`` so the field name stays readable.

        Args:
            text: Input text.

        Returns:
            tuple: ``(cleaned_text, count_of_pii_removed)`` where the count is
            the total number of substitutions made across all patterns.
        """
        cleaned_text = text
        total_removed = 0

        for pii_type, pattern in self.patterns.items():
            if pii_type in self._LABELLED_TYPES:
                replacement = r'\1: [REDACTED]'
            else:
                replacement = '[REDACTED]'

            # subn substitutes and counts in a single pass over the text.
            cleaned_text, count = pattern.subn(replacement, cleaned_text)

            if count:
                logger.debug("Found %d instances of %s", count, pii_type)
                total_removed += count

        logger.info("Removed %d PII entities", total_removed)

        return cleaned_text, total_removed