File size: 2,547 Bytes
d8473b6 764e30e d8473b6 764e30e d8473b6 764e30e d8473b6 764e30e d8473b6 764e30e d8473b6 764e30e d8473b6 764e30e d8473b6 764e30e d8473b6 764e30e d8473b6 764e30e 1915c66 d8473b6 764e30e d8473b6 764e30e d8473b6 764e30e d8473b6 764e30e d8473b6 764e30e d8473b6 764e30e 1915c66 764e30e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
import re
import logging
logger = logging.getLogger(__name__)


class RegexPIIRemover:
    """Redact common PII fragments from free text using regular expressions.

    The compiled patterns live in ``self.patterns`` (name -> compiled regex)
    and are applied one after another in insertion order, each pass running
    on the output of the previous one. Label-specific patterns must therefore
    be registered before any generic pattern that would consume the same
    characters (``dob`` before ``date``).
    """

    # Pattern names whose first capture group is a label (e.g. "DOB", "MRN")
    # that is kept in the output; only the value after the label is redacted.
    _LABELLED_TYPES = frozenset({'patient_name', 'dob', 'mrn'})

    def __init__(self):
        """Compile the PII patterns and store them in ``self.patterns``."""
        self.patterns = {
            # Social Security Numbers (checked before phone numbers)
            'ssn': re.compile(r'\b\d{3}-\d{2}-\d{4}\b'),
            # Phone numbers: 10 digits with optional '-' or '.' separators
            'phone': re.compile(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b'),
            # Email addresses. The TLD class is [A-Za-z]; a '|' inside the
            # class would be a literal pipe and over-match delimited text.
            'email': re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'),
            # Date of Birth. Must precede 'date': otherwise the generic date
            # pattern redacts the digits first and this one can never match.
            'dob': re.compile(r'(DOB|Date of Birth)[:\s]+\d{1,2}[/-]\d{1,2}[/-]\d{2,4}', re.IGNORECASE),
            # Dates (MM/DD/YYYY, MM-DD-YYYY, etc.)
            'date': re.compile(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b'),
            # Medical Record Numbers (MRN)
            'mrn': re.compile(r'\b(MRN|Medical Record Number)[:\s]+\w+\b', re.IGNORECASE),
            # ZIP codes (5 digits, optional +4)
            'zip': re.compile(r'\b\d{5}(-\d{4})?\b'),
            # Names (simple pattern - captures "Patient: John Doe" or "Name: Jane Smith")
            # NOTE(review): re.IGNORECASE also applies to the [A-Z][a-z]+ name
            # part, so any two alphabetic words after the label match (e.g.
            # "patient was seen"). Left as-is to avoid lowering recall - confirm.
            'patient_name': re.compile(r'(Patient|Name)[:\s]+([A-Z][a-z]+\s[A-Z][a-z]+)', re.IGNORECASE),
        }
        logger.info("RegexPIIRemover initialized with %d patterns", len(self.patterns))

    def remove_pii(self, text: str) -> tuple[str, int]:
        """Replace every pattern match in *text* with a redaction placeholder.

        Args:
            text: Input text.

        Returns:
            tuple: ``(cleaned_text, count_of_pii_removed)`` where the count is
            the total number of substitutions made across all patterns.
        """
        cleaned_text = text
        total_removed = 0
        for pii_type, pattern in self.patterns.items():
            # Labelled patterns keep their label (group 1) and normalise the
            # separator to ": "; everything else is replaced wholesale.
            if pii_type in self._LABELLED_TYPES:
                replacement = r'\1: [REDACTED]'
            else:
                replacement = '[REDACTED]'
            # subn substitutes and counts in a single scan of the text.
            cleaned_text, count = pattern.subn(replacement, cleaned_text)
            if count:
                logger.debug("Found %d instances of %s", count, pii_type)
                total_removed += count
        logger.info("Removed %d PII entities", total_removed)
        return cleaned_text, total_removed