"""PII auto-detection for catalog columns.

When pii_flag is set True, sample_values is forced to None so real PII
never enters LLM prompts. Patterns live in src/security/pii_patterns.py.
"""

from src.security.pii_patterns import EMAIL_REGEX, PHONE_REGEX, PII_NAME_PATTERNS

from .models import Column


class PIIDetector:
    """Decide whether a column's name or sampled values look sensitive.

    Bias is intentional: false positives hide harmless sample values,
    false negatives leak data. When unsure, flag.

    This class only returns a verdict; nothing here mutates the column.
    NOTE(review): the caller is presumably responsible for setting
    ``pii_flag`` and clearing ``sample_values`` -- confirm at the call site.
    """

    def detect(self, column: Column) -> bool:
        """Return True if the column should be treated as PII.

        The column name is checked first; sampled values are only inspected
        when the name is not conclusive and samples are present.
        """
        if self._name_matches(column.name):
            return True
        if column.sample_values and self._values_match(column.sample_values):
            return True
        return False

    @staticmethod
    def _name_matches(name: str) -> bool:
        """Return True if any PII name pattern is a substring of ``name``.

        The comparison lower-cases ``name`` only; the patterns are used as
        given.
        """
        lowered = name.lower()
        return any(pat in lowered for pat in PII_NAME_PATTERNS)

    @staticmethod
    def _values_match(values: list) -> bool:
        """Return True if any sampled value looks like an email or phone.

        ``None`` entries are skipped. Every other value is converted to text
        and tested against ``EMAIL_REGEX`` and ``PHONE_REGEX`` with
        ``.match()``, which anchors at the start of the string.
        """
        for value in values:
            if value is None:
                continue
            if isinstance(value, (bytes, bytearray)):
                # str(b"a@b.co") yields the repr "b'a@b.co'", which can never
                # match at position 0; decode so binary-typed columns are
                # inspected like text instead of silently passing.
                text = value.decode("utf-8", errors="replace")
            else:
                text = str(value)
            # .match() is anchored at the start, so padded values such as
            # " a@b.co" would otherwise slip through as false negatives.
            text = text.strip()
            if EMAIL_REGEX.match(text) or PHONE_REGEX.match(text):
                return True
        return False