""" upif.modules.output_protection ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The Data Loss Prevention (DLP) layer. Scans outgoing model responses for Personally Identifiable Information (PII) and Secrets (API Keys) to prevent leakage. :copyright: (c) 2025 Yash Dhone. :license: Proprietary, see LICENSE for details. """ import re from typing import Any, Optional, Dict, List from upif.core.interfaces import SecurityModule class OutputShield(SecurityModule): """ PII and Secret Redaction Shield. Targets: - Email Addresses - US Phone Numbers - SSN (Social Security Numbers) - Generic API Keys (sk-..., gh_-...) """ def __init__(self): # Compiled Regex Patterns for Performance self.patterns: List[Dict[str, Any]] = [ # Email (Standard RFC-ish) { "name": "EMAIL_REDACTED", "regex": re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b') }, # Phone (US Format mostly, simplistic) { "name": "PHONE_REDACTED", "regex": re.compile(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b') }, # SSN (Simple) { "name": "SSN_REDACTED", "regex": re.compile(r'\b\d{3}-\d{2}-\d{4}\b') }, # API Keys (Common prefixes) { "name": "API_KEY_REDACTED", "regex": re.compile(r'\b(sk-[a-zA-Z0-9]{20,}|gh[pousr]-[a-zA-Z0-9]{20,})\b') } ] def scan(self, content: Any, metadata: Optional[Dict[str, Any]] = None) -> Any: """ Redacts Sensitive Info from the content string. Args: content (Any): Model response. Returns: str: Redacted string (e.g., "Email: [EMAIL_REDACTED]") """ if not isinstance(content, str): return content sanitized = content for p in self.patterns: # Replace found patterns with [NAME] # Efficient implementation: re.sub scans whole string sanitized = p["regex"].sub(f"[{p['name']}]", sanitized) return sanitized