| """PII auto-detection for catalog columns. | |
| When pii_flag is set True, sample_values is forced to None so real PII | |
| never enters LLM prompts. Patterns live in src/security/pii_patterns.py. | |
| """ | |
| from src.security.pii_patterns import EMAIL_REGEX, PHONE_REGEX, PII_NAME_PATTERNS | |
| from .models import Column | |
class PIIDetector:
    """Decide whether a catalog column looks like it holds PII.

    A column is considered sensitive when either its name or any of its
    sampled values looks sensitive.

    Bias is intentional: false positives hide harmless sample values,
    false negatives leak data. When unsure, flag.
    """

    def detect(self, column: Column) -> bool:
        """Return True when ``column`` should be treated as PII.

        The column name is checked first; sampled values are inspected only
        when the name did not match and ``column.sample_values`` is
        non-empty.

        NOTE(review): this method only reports the verdict; it does not
        mutate ``column``. Setting ``pii_flag`` / clearing ``sample_values``
        is presumably done by the caller -- confirm.
        """
        if self._name_matches(column.name):
            return True
        if column.sample_values and self._values_match(column.sample_values):
            return True
        return False

    # The two helpers below read no instance state. They must be static:
    # without ``@staticmethod`` (or a ``self`` parameter) the bound calls in
    # ``detect`` pass the instance as an extra positional argument and raise
    # ``TypeError``.
    @staticmethod
    def _name_matches(name: str) -> bool:
        """Return True if any PII_NAME_PATTERNS entry occurs in ``name``.

        The name is lower-cased before the substring test, so the patterns
        are presumably stored in lower case -- verify against
        ``pii_patterns``.
        """
        lowered = name.lower()
        return any(pat in lowered for pat in PII_NAME_PATTERNS)

    @staticmethod
    def _values_match(values: list) -> bool:
        """Return True if any sampled value looks like an email or phone.

        ``None`` entries are skipped; every other value is converted with
        ``str()`` before matching.

        NOTE(review): ``.match`` is used, which (assuming these are compiled
        ``re`` patterns) only matches at the start of the string, so values
        with leading text or whitespace are not flagged. Given the
        "when unsure, flag" bias, confirm whether ``.search`` is wanted.
        """
        for v in values:
            if v is None:
                continue
            s = str(v)
            if EMAIL_REGEX.match(s) or PHONE_REGEX.match(s):
                return True
        return False