Spaces:
Sleeping
Sleeping
| """ | |
| Input validation utilities - Policy Summarizer | |
| """ | |
| import re | |
| from urllib.parse import urlparse | |
| from typing import Tuple | |
# Maximum number of characters of page content to process.
MAX_CONTENT_LENGTH = 50000

# URL validation pattern (case-insensitive). Accepts, in order:
#   * an http:// or https:// scheme,
#   * a host that is a dotted domain name, the literal "localhost", or a
#     dotted-quad of 1-3 digit groups (octet ranges are NOT checked here),
#   * an optional ":port",
#   * an optional path/query that contains no whitespace.
# The TLD may be 2-63 letters (the DNS label limit) or a punycode "xn--"
# label, so hosts such as example.technology are accepted. A TLD starting
# with "localhost" is refused so that "foo.localhost" cannot slip through
# the domain-name branch.
URL_PATTERN = re.compile(
    r'^https?://'
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+'
    r'(?:(?!localhost)[A-Z]{2,63}|XN--[A-Z0-9-]{1,59})\.?|'
    r'localhost|'
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'
    r'(?::\d+)?'
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)
def validate_url(url: str) -> Tuple[bool, str]:
    """Validate that *url* is well-formed and safe to scrape.

    Checks, in order: non-empty string, length (max 2048 characters),
    match against ``URL_PATTERN``, successful parsing (including the port),
    http/https scheme, presence of a host, and that the host is not
    localhost or a non-public IP literal.

    Args:
        url: The URL supplied by the user. Surrounding whitespace is ignored.

    Returns:
        ``(True, "")`` when the URL is acceptable, otherwise
        ``(False, message)`` where *message* describes the first failed check.

    Note:
        Only the literal host in the URL is inspected. A domain name that
        resolves to a private address is not detected here.
    """
    # Function-scope import so this block does not depend on a change to the
    # module's import section.
    import ipaddress

    if not url or not isinstance(url, str):
        return False, "URL cannot be empty"

    url = url.strip()
    if not url:
        return False, "URL cannot be empty"

    if len(url) > 2048:
        return False, "URL is too long (max 2048 characters)"

    if not URL_PATTERN.match(url):
        return False, "Invalid URL format. Must start with http:// or https://"

    try:
        parsed = urlparse(url)
        host = parsed.hostname
        # Accessing .port raises ValueError for a port outside 0-65535,
        # which the pattern's ":\d+" does not rule out.
        parsed.port
    except ValueError as e:
        return False, f"Failed to parse URL: {str(e)}"

    if parsed.scheme not in ('http', 'https'):
        return False, "URL must use http or https protocol"

    if not parsed.netloc or not host:
        return False, "URL must have a valid domain"

    # Normalise: case-insensitive, and a trailing dot denotes the same host.
    host = host.lower().rstrip('.')

    if host == 'localhost' or host.endswith('.localhost'):
        return False, "Cannot scrape localhost or private addresses"

    try:
        ip = ipaddress.ip_address(host)
    except ValueError:
        # Not a valid IP literal. A purely numeric dotted host (for example
        # "999.1.1.1" or "127.000.000.001") is not a domain name either, and
        # lenient resolvers may map such forms onto internal addresses.
        if host.replace('.', '').isdigit():
            return False, "URL must have a valid domain"
        return True, ""

    non_public = (
        ip.is_private,
        ip.is_loopback,
        ip.is_link_local,
        ip.is_multicast,
        ip.is_reserved,
        ip.is_unspecified,
    )
    if any(non_public):
        return False, "Cannot scrape localhost or private addresses"

    return True, ""
def is_likely_policy_url(url: str) -> bool:
    """Heuristically check whether *url* points to a policy page.

    The check is a case-insensitive keyword search over the whole URL.

    Args:
        url: The URL to inspect.

    Returns:
        ``True`` if the URL contains one of the policy-related keywords,
        ``False`` otherwise, including for empty or non-string input.
    """
    if not url or not isinstance(url, str):
        return False

    url_lower = url.lower()

    # Distinctive keywords are matched as plain substrings.
    keywords = ('privacy', 'policy', 'terms', 'legal', 'service', 'conditions')
    if any(keyword in url_lower for keyword in keywords):
        return True

    # "tos" is short enough to occur inside unrelated words ("photos",
    # "autos"), so it only counts when it is not surrounded by letters.
    return re.search(r'(?<![a-z])tos(?![a-z])', url_lower) is not None
def sanitize_text(text: str) -> str:
    """Clean scraped text and mask common prompt-injection phrases.

    Steps: drop NUL characters, collapse runs of three or more newlines to
    two, collapse runs of three or more spaces to one, replace known
    injection phrases with ``[FILTERED]`` (case-insensitive), then strip
    leading and trailing whitespace.

    Args:
        text: Raw text extracted from a page.

    Returns:
        The sanitized text, or ``""`` when *text* is empty or ``None``.

    Note:
        Phrase filtering is best-effort and cannot catch every rewording.
    """
    if not text:
        return ""

    text = text.replace('\x00', '')
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = re.sub(r' {3,}', ' ', text)

    # Remove potential prompt injection patterns.
    injection_patterns = (
        # One or more qualifier words may sit between the verb and
        # "instructions", so "ignore all previous instructions" is caught
        # as well as "ignore previous instructions".
        r'(?:ignore|disregard|forget)\s+'
        r'(?:(?:all|any|the|your|previous|prior|above|earlier|preceding)\s+)+'
        r'instructions',
        r'new\s+instructions?\s*:',
        # Must not be preceded by a letter, so words that merely end in
        # "system" (e.g. "ecosystem:") are left intact.
        r'(?<![A-Za-z])system\s*:\s*',
    )
    for pattern in injection_patterns:
        text = re.sub(pattern, '[FILTERED]', text, flags=re.IGNORECASE)

    return text.strip()
def truncate_content(content: str, max_length: int = MAX_CONTENT_LENGTH) -> str:
    """Cut *content* down to at most *max_length* characters plus a notice.

    Content that already fits is returned unchanged. Otherwise the text is
    cut at *max_length*; if the last period in the kept part lies beyond 80%
    of *max_length*, the cut is moved back to just after that period so the
    text ends on a sentence boundary. A truncation notice is then appended,
    so the returned string can be longer than *max_length*.

    Args:
        content: The text to shorten.
        max_length: Maximum number of characters to keep before the notice.

    Returns:
        The original text, or the shortened text followed by the notice.
    """
    notice = "\n\n[Content truncated due to length...]"

    if len(content) <= max_length:
        return content

    head = content[:max_length]
    cut = head.rfind('.')
    # Only back up to the period when it costs less than 20% of the budget.
    if cut > max_length * 0.8:
        head = head[:cut + 1]

    return head + notice
def validate_content_length(content: str) -> Tuple[bool, str]:
    """Check that extracted content is present and long enough.

    Args:
        content: Text extracted from the page.

    Returns:
        ``(True, "")`` when the content has at least 50 whitespace-separated
        words; otherwise ``(False, message)`` explaining whether the content
        was empty or too short.
    """
    has_text = bool(content and content.strip())
    if not has_text:
        return False, "No content was extracted from the page"

    word_count = len(content.split())
    if word_count >= 50:
        return True, ""

    return False, f"Content too short ({word_count} words). This may not be a valid policy page."