""" Input validation utilities - Policy Summarizer """ import re from urllib.parse import urlparse from typing import Tuple # Maximum content length to process MAX_CONTENT_LENGTH = 50000 # URL validation pattern URL_PATTERN = re.compile( r'^https?://' r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|' r'localhost|' r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' r'(?::\d+)?' r'(?:/?|[/?]\S+)$', re.IGNORECASE) def validate_url(url: str) -> Tuple[bool, str]: """Validate if the URL is valid and safe to scrape.""" if not url or not isinstance(url, str): return False, "URL cannot be empty" url = url.strip() if len(url) > 2048: return False, "URL is too long (max 2048 characters)" if not URL_PATTERN.match(url): return False, "Invalid URL format. Must start with http:// or https://" try: parsed = urlparse(url) except Exception as e: return False, f"Failed to parse URL: {str(e)}" if parsed.scheme not in ['http', 'https']: return False, "URL must use http or https protocol" if not parsed.netloc: return False, "URL must have a valid domain" blocked_hosts = ['localhost', '127.0.0.1', '0.0.0.0', '::1'] if parsed.hostname and parsed.hostname.lower() in blocked_hosts: return False, "Cannot scrape localhost or private addresses" return True, "" def is_likely_policy_url(url: str) -> bool: """Check if the URL likely points to a policy page.""" keywords = ['privacy', 'policy', 'terms', 'tos', 'legal', 'service', 'conditions'] url_lower = url.lower() return any(keyword in url_lower for keyword in keywords) def sanitize_text(text: str) -> str: """Sanitize text content to prevent prompt injection.""" if not text: return "" text = text.replace('\x00', '') text = re.sub(r'\n{3,}', '\n\n', text) text = re.sub(r' {3,}', ' ', text) # Remove potential prompt injection patterns injection_patterns = [ r'ignore\s+(previous|above|all)\s+instructions', r'disregard\s+(previous|above|all)\s+instructions', r'forget\s+(previous|above|all)\s+instructions', r'new\s+instructions?\s*:', r'system\s*:\s*', ] for pattern in injection_patterns: text = re.sub(pattern, '[FILTERED]', text, flags=re.IGNORECASE) return text.strip() def truncate_content(content: str, max_length: int = MAX_CONTENT_LENGTH) -> str: """Truncate content to maximum length while preserving sentences.""" if len(content) <= max_length: return content truncated = content[:max_length] last_period = truncated.rfind('.') if last_period > max_length * 0.8: truncated = truncated[:last_period + 1] return truncated + "\n\n[Content truncated due to length...]" def validate_content_length(content: str) -> Tuple[bool, str]: """Validate that content is not empty and not too short.""" if not content or not content.strip(): return False, "No content was extracted from the page" word_count = len(content.split()) if word_count < 50: return False, f"Content too short ({word_count} words). This may not be a valid policy page." return True, ""