Spaces:
Sleeping
Sleeping
File size: 3,334 Bytes
1b963f1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 | """
Input validation utilities - Policy Summarizer
"""
import ipaddress
import re
from typing import Tuple
from urllib.parse import urlparse
# Upper bound on how much extracted content is processed
MAX_CONTENT_LENGTH = 50_000
# URL validation pattern: http(s) scheme, then a domain name, "localhost" or a
# dotted-quad IPv4 host, an optional port, and an optional path/query.
URL_PATTERN = re.compile(
    r'^https?://'
    # A TLD label may be up to 63 letters long (e.g. ".technology"); the old
    # 6-letter cap rejected many valid domains.
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,63}\.?|'
    r'localhost|'
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'
    r'(?::\d+)?'
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)

# Literal hosts that always refer to the local machine.
_BLOCKED_HOSTS = frozenset({'localhost', '127.0.0.1', '0.0.0.0', '::1'})

# Special-use suffixes that name loopback or internal-only hosts.
_BLOCKED_HOST_SUFFIXES = ('.localhost', '.local', '.internal')

_BLOCKED_HOST_MESSAGE = "Cannot scrape localhost or private addresses"
_INVALID_DOMAIN_MESSAGE = "URL must have a valid domain"


def validate_url(url: str) -> Tuple[bool, str]:
    """Validate that a URL is well formed and safe to scrape.

    The URL must be a non-empty string of at most 2048 characters, use the
    http or https scheme, carry a valid port (if any), and must not point at
    the local machine or at a non-public IPv4 address.

    Args:
        url: The candidate URL. Surrounding whitespace is ignored.

    Returns:
        A ``(is_valid, error_message)`` tuple. ``error_message`` is the empty
        string when ``is_valid`` is True.

    Note:
        Only the literal host in the URL is inspected; no DNS lookup is done,
        so a public name that resolves to a private address is not detected
        here.
    """
    if not url or not isinstance(url, str):
        return False, "URL cannot be empty"

    url = url.strip()
    if len(url) > 2048:
        return False, "URL is too long (max 2048 characters)"
    if not URL_PATTERN.match(url):
        return False, "Invalid URL format. Must start with http:// or https://"

    try:
        parsed = urlparse(url)
        # Reading .port raises ValueError for ports outside 0-65535, which the
        # regex alone does not rule out.
        _ = parsed.port
    except ValueError as e:
        return False, f"Failed to parse URL: {str(e)}"

    if parsed.scheme not in ('http', 'https'):
        return False, "URL must use http or https protocol"
    if not parsed.netloc:
        return False, _INVALID_DOMAIN_MESSAGE

    # urlparse already lower-cases the hostname; drop the optional root dot so
    # "example.com." and "example.com" are treated alike.
    host = (parsed.hostname or '').rstrip('.')

    # With re.IGNORECASE a few non-ASCII letters (e.g. U+017F, long s) match
    # ASCII ones in the regex, so "localho\u017ft" would slip past the block
    # list below. Real hostnames accepted by the pattern are ASCII only.
    if not host or any(ord(ch) > 127 for ch in host):
        return False, _INVALID_DOMAIN_MESSAGE

    if host in _BLOCKED_HOSTS or host.endswith(_BLOCKED_HOST_SUFFIXES):
        return False, _BLOCKED_HOST_MESSAGE

    # Purely numeric hosts are IPv4 literals: reject malformed ones and every
    # address that is not publicly routable (private, loopback, link-local,
    # reserved, ...), plus multicast.
    if host.replace('.', '').isdigit():
        try:
            address = ipaddress.ip_address(host)
        except ValueError:
            return False, _INVALID_DOMAIN_MESSAGE
        if not address.is_global or address.is_multicast:
            return False, _BLOCKED_HOST_MESSAGE

    return True, ""
def is_likely_policy_url(url: str) -> bool:
    """Return True when the URL contains a policy-related keyword.

    The check is a case-insensitive substring test over the whole URL, so a
    keyword embedded in a longer word also counts.
    """
    lowered = url.lower()
    for marker in ('privacy', 'policy', 'terms', 'tos', 'legal', 'service', 'conditions'):
        if marker in lowered:
            return True
    return False
# Phrases commonly used to hijack a prompt, compiled once at import time.
# Every pattern starts at a word boundary so ordinary words that merely end
# in a trigger (e.g. "ecosystem:") are left alone.
_INJECTION_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        # "ignore previous instructions", "ignore all previous instructions",
        # "disregard all of the above instructions", ...
        r'\b(?:ignore|disregard|forget)\s+'
        r'(?:(?:all|any|of|the)\s+)*'
        r'(?:previous|prior|above|all)\s+instructions',
        r'\bnew\s+instructions?\s*:',
        r'\bsystem\s*:\s*',
    )
)


def sanitize_text(text: str) -> str:
    """Clean scraped text and mask common prompt-injection phrases.

    NUL characters are removed, runs of three or more newlines collapse to a
    blank line, runs of three or more spaces collapse to one space, and each
    known injection phrase is replaced with ``[FILTERED]``. This is a
    best-effort filter, not a complete defence against prompt injection.

    Args:
        text: Raw text; any falsy value (``""`` or ``None``) yields ``""``.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    if not text:
        return ""

    text = text.replace('\x00', '')
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = re.sub(r' {3,}', ' ', text)

    for pattern in _INJECTION_PATTERNS:
        text = pattern.sub('[FILTERED]', text)

    return text.strip()
def truncate_content(content: str, max_length: int = MAX_CONTENT_LENGTH) -> str:
    """Cut content down to ``max_length`` characters, ending on a sentence if possible.

    Content that already fits is returned unchanged. Otherwise the first
    ``max_length`` characters are kept, trimmed back to the last period when
    that period lies beyond 80% of ``max_length``, and a truncation notice is
    appended (so the result can be longer than ``max_length``).
    """
    if len(content) <= max_length:
        return content

    head = content[:max_length]
    sentence_end = head.rfind('.')
    # Only back up to the period if doing so keeps most of the allowed text.
    kept = head[:sentence_end + 1] if sentence_end > max_length * 0.8 else head
    return f"{kept}\n\n[Content truncated due to length...]"
def validate_content_length(content: str) -> Tuple[bool, str]:
    """Check that extracted content is non-blank and has at least 50 words.

    Returns:
        A ``(is_valid, error_message)`` tuple; the message is empty on success.
    """
    if not (content and content.strip()):
        return False, "No content was extracted from the page"

    words = content.split()
    if len(words) >= 50:
        return True, ""
    return False, f"Content too short ({len(words)} words). This may not be a valid policy page."
|