# Commit 6bff5d9 by ishaq101: feat/Catalog Retrieval System (#1)
"""PII auto-detection for catalog columns.
When pii_flag is set True, sample_values is forced to None so real PII
never enters LLM prompts. Patterns live in src/security/pii_patterns.py.
"""
from src.security.pii_patterns import EMAIL_REGEX, PHONE_REGEX, PII_NAME_PATTERNS
from .models import Column
class PIIDetector:
    """Decide whether a catalog column looks like it holds PII.

    ``detect`` only returns a verdict; applying it (e.g. setting
    ``pii_flag`` on the column) is presumably done by the caller.

    The bias is intentional: a false positive merely hides harmless sample
    values, whereas a false negative leaks data. When unsure, flag.
    """

    def detect(self, column: Column) -> bool:
        """Return True when ``column`` should be treated as PII.

        The column name is checked first. Sample values are scanned only
        when the name is not already a hit and samples are present.
        """
        if self._name_matches(column.name):
            return True
        samples = column.sample_values
        # Short-circuit so a None or empty sample list never reaches the scan.
        return bool(samples) and self._values_match(samples)

    @staticmethod
    def _name_matches(name: str) -> bool:
        """Return True if any PII name pattern is a substring of ``name``.

        ``name`` is lower-cased before comparison; the patterns themselves
        are used as-is (assumed to be lower-case already -- confirm in
        src/security/pii_patterns.py).
        """
        lowered = name.lower()
        return any(pat in lowered for pat in PII_NAME_PATTERNS)

    @staticmethod
    def _values_match(values: list) -> bool:
        """Return True if any sample value contains an email or phone number.

        ``None`` entries are skipped; every other value is stringified
        before being tested against the email and phone patterns.
        """
        for value in values:
            if value is None:
                continue
            text = str(value)
            # search(), not match(): match() only tests at position 0, so a
            # value with leading whitespace or a prefix such as
            # "Alice <alice@example.com>" would slip through unflagged.
            # Missing PII is the costly error here, so scan the whole string.
            if EMAIL_REGEX.search(text) or PHONE_REGEX.search(text):
                return True
        return False