import re
from typing import List, Dict, Tuple
from datetime import datetime


class PIIDetector:
    """
    Detect and mask Personally Identifiable Information (PII) in text.

    Detection is purely regex based: every pattern in ``self.patterns`` is
    run over the input independently, so one piece of text can be reported
    under several classifications (a 16-digit card number also matches the
    ``aadhar_num`` and ``cvv_no`` patterns, for example).  ``mask_pii``
    resolves such overlaps before rewriting the text.
    """

    # Formats tried, in order, when validating a ``dob`` regex match.
    # Two-digit-year variants are included because the ``dob`` pattern
    # accepts years of 2-4 digits.
    _DATE_FORMATS = (
        '%m/%d/%Y', '%m-%d-%Y', '%d/%m/%Y', '%d-%m-%Y',
        '%m/%d/%y', '%m-%d-%y', '%d/%m/%y', '%d-%m-%y',
        '%b %d, %Y', '%B %d, %Y',
    )

    def __init__(self):
        # Compile regex patterns for different PII types
        self.patterns = {
            "full_name": re.compile(r'\b([A-Z][a-z]+(\s[A-Z][a-z]+)+)\b'),
            # TLD class is [A-Za-z]; a '|' inside a character class is a
            # literal pipe, not alternation.
            "email": re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'),
            "phone_number": re.compile(r'(\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b'),
            "dob": re.compile(r'\b(\d{1,2}[-/]\d{1,2}[-/]\d{2,4}|(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2}, \d{4})\b'),
            "aadhar_num": re.compile(r'\b\d{4}[ -]?\d{4}[ -]?\d{4}\b'),
            "credit_debit_no": re.compile(r'\b(?:\d[ -]*?){13,16}\b'),
            "cvv_no": re.compile(r'\b\d{3,4}\b'),
            "expiry_no": re.compile(r'\b(0[1-9]|1[0-2])[-/]\d{2}\b')
        }

    def detect_pii(self, text: str) -> List[Dict]:
        """
        Detect all PII entities in the given text.

        Args:
            text: Input text to scan for PII

        Returns:
            List of dictionaries with keys ``position`` (``[start, end]``
            offsets into ``text``), ``classification`` (the pattern name)
            and ``entity`` (the matched substring), sorted by start offset.
            Matches from different patterns may overlap.
        """
        entities = []

        for entity_type, pattern in self.patterns.items():
            for match in pattern.finditer(text):
                start, end = match.span()
                entity_value = match.group()

                # Additional validation for specific entity types
                if entity_type == "credit_debit_no" and not self._validate_luhn(entity_value):
                    continue
                if entity_type == "dob" and not self._validate_date(entity_value):
                    continue

                entities.append({
                    "position": [start, end],
                    "classification": entity_type,
                    "entity": entity_value
                })

        # Sort entities by start position to handle masking in order
        entities.sort(key=lambda x: x["position"][0])
        return entities

    def mask_pii(self, text: str, entities: List[Dict]) -> Tuple[str, List[Dict]]:
        """
        Mask detected PII entities in the text.

        Each masked span is replaced by ``[<classification>]``.  Entities are
        processed left to right; when several start at the same offset the
        longest is taken first, and any entity that overlaps a span already
        masked is skipped, so overlapping detections cannot corrupt the
        output.

        Args:
            text: Original text containing PII
            entities: List of detected PII entities (as returned by
                ``detect_pii``); they need not be pre-sorted

        Returns:
            Tuple of (masked_text, list_of_masked_entities).  The list holds
            only the entities that were actually masked, with positions
            referring to the ORIGINAL text.
        """
        # Earliest start first; on ties the longest span wins.
        ordered = sorted(
            entities,
            key=lambda e: (e["position"][0], -e["position"][1]),
        )

        pieces = []
        masked_entities = []
        cursor = 0  # end offset (in the original text) of the last masked span

        for entity in ordered:
            start, end = entity["position"]
            if start < cursor:
                # Overlaps a span that has already been replaced.
                continue

            entity_type = entity["classification"]
            pieces.append(text[cursor:start])
            pieces.append(f"[{entity_type}]")
            cursor = end

            # Store masked entity info
            masked_entities.append({
                "position": [start, end],
                "classification": entity_type,
                "entity": entity["entity"]
            })

        pieces.append(text[cursor:])
        return "".join(pieces), masked_entities

    def _validate_luhn(self, card_number: str) -> bool:
        """
        Validate a card number using the Luhn algorithm.

        Non-digit characters (spaces, dashes) are ignored.  Numbers with
        fewer than 13 or more than 19 digits are rejected.

        Args:
            card_number: Candidate card number, possibly with separators

        Returns:
            True if the digit string passes the Luhn checksum
        """
        # Remove non-digit characters
        digits = [int(ch) for ch in re.sub(r'[^0-9]', '', card_number)]
        if not 13 <= len(digits) <= 19:
            return False

        total = 0
        # Luhn counts from the RIGHT: starting with the digit next to the
        # check digit, every second digit is doubled.  Indexing from the
        # left only works for even-length numbers.
        for idx, digit in enumerate(reversed(digits)):
            if idx % 2 == 1:
                digit *= 2
                if digit > 9:
                    digit -= 9
            total += digit

        return total % 10 == 0

    def _validate_date(self, date_str: str) -> bool:
        """
        Check that a ``dob`` regex match is a real calendar date.

        Args:
            date_str: Candidate date string

        Returns:
            True if ``date_str`` parses under any format in
            ``_DATE_FORMATS``, otherwise False
        """
        for fmt in self._DATE_FORMATS:
            try:
                datetime.strptime(date_str, fmt)
            except ValueError:
                continue
            return True
        return False