File size: 3,334 Bytes
1b963f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
"""
Input validation utilities - Policy Summarizer
"""
import ipaddress
import re
from typing import Tuple
from urllib.parse import urlparse

# Maximum content length (in characters) to process before truncation.
MAX_CONTENT_LENGTH: int = 50000

# URL validation pattern: http(s) scheme, then a dotted domain name,
# "localhost", or a dotted-quad IP, an optional :port, and an optional
# path/query. Note: bracketed IPv6 literals are NOT matched and are
# therefore rejected at this stage.
URL_PATTERN = re.compile(
    r'^https?://'
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|'
    r'localhost|'
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'
    r'(?::\d+)?'
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)


def validate_url(url: str) -> Tuple[bool, str]:
    """Validate that *url* is well-formed and safe to scrape.

    Args:
        url: Candidate URL string.

    Returns:
        A ``(is_valid, reason)`` tuple: ``(True, "")`` when the URL passes
        all checks, otherwise ``(False, <human-readable reason>)``.
    """
    if not url or not isinstance(url, str):
        return False, "URL cannot be empty"

    url = url.strip()

    if len(url) > 2048:
        return False, "URL is too long (max 2048 characters)"

    if not URL_PATTERN.match(url):
        return False, "Invalid URL format. Must start with http:// or https://"

    try:
        parsed = urlparse(url)
    except Exception as e:
        return False, f"Failed to parse URL: {str(e)}"

    if parsed.scheme not in ['http', 'https']:
        return False, "URL must use http or https protocol"

    if not parsed.netloc:
        return False, "URL must have a valid domain"

    # SSRF guard, part 1: well-known loopback names/literals.
    blocked_hosts = ['localhost', '127.0.0.1', '0.0.0.0', '::1']
    hostname = parsed.hostname.lower() if parsed.hostname else ''
    if hostname in blocked_hosts:
        return False, "Cannot scrape localhost or private addresses"

    # SSRF guard, part 2. Previously only the literal hosts above were
    # rejected, so private-range IPs (10.x, 172.16-31.x, 192.168.x,
    # link-local 169.254.x) slipped through even though the error message
    # claims private addresses are blocked. Reject any IP-literal host that
    # is not globally routable.
    try:
        addr = ipaddress.ip_address(hostname)
    except ValueError:
        pass  # hostname is a domain name, not an IP literal
    else:
        if (addr.is_private or addr.is_loopback or addr.is_link_local
                or addr.is_reserved or addr.is_unspecified):
            return False, "Cannot scrape localhost or private addresses"

    return True, ""


def is_likely_policy_url(url: str) -> bool:
    """Heuristic check: does the URL look like it points at a policy page?"""
    policy_markers = ('privacy', 'policy', 'terms', 'tos', 'legal', 'service', 'conditions')
    lowered = url.lower()
    for marker in policy_markers:
        if marker in lowered:
            return True
    return False


def sanitize_text(text: str) -> str:
    """Clean extracted page text before feeding it to the summarizer.

    Strips NUL bytes, collapses excessive blank lines and space runs, and
    masks phrases commonly used for prompt injection with ``[FILTERED]``.
    """
    if not text:
        return ""

    cleaned = text.replace('\x00', '')
    # 3+ consecutive newlines become a single paragraph break; 3+ spaces
    # collapse to two.
    cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)
    cleaned = re.sub(r' {3,}', '  ', cleaned)

    # Mask phrases that attempt to override the LLM's instructions.
    for injection_re in (
        r'ignore\s+(previous|above|all)\s+instructions',
        r'disregard\s+(previous|above|all)\s+instructions',
        r'forget\s+(previous|above|all)\s+instructions',
        r'new\s+instructions?\s*:',
        r'system\s*:\s*',
    ):
        cleaned = re.sub(injection_re, '[FILTERED]', cleaned, flags=re.IGNORECASE)

    return cleaned.strip()


def truncate_content(content: str, max_length: int = MAX_CONTENT_LENGTH) -> str:
    """Truncate content to maximum length while preserving sentences."""
    if len(content) <= max_length:
        return content
    
    truncated = content[:max_length]
    last_period = truncated.rfind('.')
    
    if last_period > max_length * 0.8:
        truncated = truncated[:last_period + 1]
    
    return truncated + "\n\n[Content truncated due to length...]"


def validate_content_length(content: str) -> Tuple[bool, str]:
    """Check that extracted content is plausible policy text.

    Returns ``(True, "")`` when the content is non-blank and has at least
    50 whitespace-separated words; otherwise ``(False, <reason>)``.
    """
    if not content or not content.strip():
        return False, "No content was extracted from the page"

    word_count = len(content.split())
    if word_count >= 50:
        return True, ""

    return False, f"Content too short ({word_count} words). This may not be a valid policy page."
    return True, ""