File size: 2,547 Bytes
d8473b6
 
 
 
 
 
 
764e30e
d8473b6
 
764e30e
d8473b6
764e30e
 
d8473b6
764e30e
 
d8473b6
764e30e
 
d8473b6
764e30e
 
d8473b6
764e30e
 
d8473b6
764e30e
 
d8473b6
764e30e
 
d8473b6
764e30e
 
 
 
1915c66
d8473b6
764e30e
d8473b6
764e30e
d8473b6
 
764e30e
d8473b6
 
764e30e
d8473b6
764e30e
 
d8473b6
764e30e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1915c66
764e30e
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import re
import logging

# Module-level logger named after this module; no handler or level is
# configured here.
logger = logging.getLogger(__name__)


class RegexPIIRemover:
    """Remove PII using regex patterns"""
    
    def __init__(self):
        """Initialize PII removal patterns"""
        self.patterns = {
            # Social Security Numbers
            'ssn': re.compile(r'\b\d{3}-\d{2}-\d{4}\b'),
            
            # Phone numbers
            'phone': re.compile(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b'),
            
            # Email addresses
            'email': re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'),
            
            # Dates (MM/DD/YYYY, MM-DD-YYYY, etc.)
            'date': re.compile(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b'),
            
            # Medical Record Numbers (MRN)
            'mrn': re.compile(r'\b(MRN|Medical Record Number)[:\s]+\w+\b', re.IGNORECASE),
            
            # ZIP codes
            'zip': re.compile(r'\b\d{5}(-\d{4})?\b'),
            
            # Names (simple pattern - captures "Patient: John Doe" or "Name: Jane Smith")
            'patient_name': re.compile(r'(Patient|Name)[:\s]+([A-Z][a-z]+\s[A-Z][a-z]+)', re.IGNORECASE),
            
            # Date of Birth
            'dob': re.compile(r'(DOB|Date of Birth)[:\s]+\d{1,2}[/-]\d{1,2}[/-]\d{2,4}', re.IGNORECASE),
        }
        
        logger.info(f"RegexPIIRemover initialized with {len(self.patterns)} patterns")
    
    def remove_pii(self, text: str) -> tuple[str, int]:
        """
        Remove PII from text
        
        Args:
            text: Input text
            
        Returns:
            tuple: (cleaned_text, count_of_pii_removed)
        """
        cleaned_text = text
        total_removed = 0
        
        for pii_type, pattern in self.patterns.items():
            matches = pattern.findall(cleaned_text)
            count = len(matches)
            
            if count > 0:
                logger.debug(f"Found {count} instances of {pii_type}")
                total_removed += count
                
                # Replace with redacted placeholder
                if pii_type == 'patient_name':
                    cleaned_text = pattern.sub(r'\1: [REDACTED]', cleaned_text)
                elif pii_type in ['dob', 'mrn']:
                    cleaned_text = pattern.sub(r'\1: [REDACTED]', cleaned_text)
                else:
                    cleaned_text = pattern.sub('[REDACTED]', cleaned_text)
        
        logger.info(f"Removed {total_removed} PII entities")
        
        return cleaned_text, total_removed