icd-cpt-coding-api-backend / src /services /regex_pii_remover.py
Distopia22's picture
Fixing model hallucination
1915c66
import re
import logging
logger = logging.getLogger(__name__)


class RegexPIIRemover:
    """Redact common PII from free text using compiled regular expressions.

    Patterns are applied one after another in the insertion order of
    ``self.patterns``; every pattern operates on the text as already
    redacted by the patterns that ran before it.
    """

    # Text written in place of every redacted value.
    _PLACEHOLDER = '[REDACTED]'

    # PII types whose pattern captures a leading label (e.g. "MRN", "DOB",
    # "Patient") in group 1; the label is kept and only the value is redacted.
    _LABELLED_TYPES = frozenset({'patient_name', 'dob', 'mrn'})

    def __init__(self):
        """Compile the PII patterns and store them in ``self.patterns``."""
        self.patterns = {
            # Social Security Numbers in the dashed form NNN-NN-NNNN.
            'ssn': re.compile(r'\b\d{3}-\d{2}-\d{4}\b'),
            # Ten-digit phone numbers, optionally separated by '-' or '.'.
            'phone': re.compile(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b'),
            # Email addresses. The TLD class is letters only: a '|' inside a
            # character class is a literal pipe, not an alternation.
            'email': re.compile(
                r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
            ),
            # Numeric dates separated by '/' or '-' (MM/DD/YYYY, MM-DD-YY, ...).
            'date': re.compile(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b'),
            # Medical Record Numbers introduced by an "MRN" style label.
            'mrn': re.compile(
                r'\b(MRN|Medical Record Number)[:\s]+\w+\b', re.IGNORECASE
            ),
            # ZIP codes (5 digits, optional +4). Matches any standalone
            # five-digit number.
            'zip': re.compile(r'\b\d{5}(-\d{4})?\b'),
            # Names introduced by a "Patient"/"Name" label, e.g.
            # "Patient: John Doe" or "NAME: JANE SMITH". Only the label is
            # case-insensitive (scoped ``(?i:...)`` flag); each name word must
            # start with a capital letter. A pattern-wide IGNORECASE would
            # make the name part match any two words, so ordinary clinical
            # text such as "patient has diabetes" would be redacted.
            'patient_name': re.compile(
                r'((?i:Patient|Name))[:\s]+([A-Z][A-Za-z]+\s[A-Z][A-Za-z]+)'
            ),
            # Labelled date of birth. NOTE: 'date' runs earlier and matches
            # the same numeric date formats, so this usually finds nothing
            # left to redact; it is kept as a fallback.
            'dob': re.compile(
                r'(DOB|Date of Birth)[:\s]+\d{1,2}[/-]\d{1,2}[/-]\d{2,4}',
                re.IGNORECASE,
            ),
        }
        logger.info(
            "RegexPIIRemover initialized with %d patterns", len(self.patterns)
        )

    def remove_pii(self, text: str) -> tuple[str, int]:
        """Replace every PII match in *text* with a redaction placeholder.

        Args:
            text: Input text.

        Returns:
            tuple: ``(cleaned_text, count_of_pii_removed)`` where the count
            is the total number of substitutions made across all patterns.
        """
        labelled_replacement = r'\1: ' + self._PLACEHOLDER
        cleaned_text = text
        total_removed = 0

        for pii_type, pattern in self.patterns.items():
            if pii_type in self._LABELLED_TYPES:
                # Keep the captured label, redact only the value after it.
                replacement = labelled_replacement
            else:
                replacement = self._PLACEHOLDER

            # subn substitutes and counts in a single scan of the text.
            cleaned_text, count = pattern.subn(replacement, cleaned_text)
            if count:
                logger.debug("Found %d instances of %s", count, pii_type)
                total_removed += count

        logger.info("Removed %d PII entities", total_removed)
        return cleaned_text, total_removed