File size: 8,503 Bytes
e272f4f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
import logging
import re
from collections import deque
from typing import Dict, List, Optional, Set
from urllib.parse import urljoin, urlparse

import aiohttp
import requests
from bs4 import BeautifulSoup

from app.config import Config


class URLCrawler:
    """
    A web crawler that extracts and processes content from websites.
    Handles both synchronous and asynchronous crawling operations.

    Features:
    - URL validation and sanitization (HTTP(S)-only, same-domain, binary skip)
    - Content extraction with boilerplate/noise removal
    - Breadth-first crawling with a configurable page limit

    NOTE(review): an earlier docstring claimed robots.txt support, but no
    robots.txt handling exists anywhere in this class -- do not rely on it.
    """

    # URL substrings that indicate binary resources we never fetch.
    _SKIP_EXTENSIONS = ('.pdf', '.jpg', '.png', '.zip')

    # Compiled once: boilerplate phrases stripped from extracted text.
    _BOILERPLATE_RE = re.compile(
        r'(\b(privacy policy|terms of service|cookie policy)\b'
        r'|\b\d+\s*(comments|shares|likes)\b)',
        re.I,
    )

    # Lines at or below this length (after cleaning) are treated as noise.
    _MIN_LINE_CHARS = 30
    # Pages with fewer words than this are considered too thin to index.
    _MIN_PAGE_WORDS = 100

    def __init__(self) -> None:
        """Initialize the crawler with default settings."""
        self.visited_urls: Set[str] = set()  # URLs already crawled (dedup)
        self.logger = logging.getLogger(__name__)
        # Identifies the bot explicitly; Accept-Language narrows responses.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; RAGBot/1.0)',
            'Accept-Language': 'en-US,en;q=0.9'
        }

    def is_valid_url(self, url: str, base_domain: str) -> bool:
        """
        Decide whether a URL should be crawled.

        Args:
            url: Absolute URL to validate.
            base_domain: Network location (netloc) the crawl must stay within.

        Returns:
            bool: True if the URL is HTTP(S), on the target domain, not an
            obvious binary resource, and not already visited.
        """
        parsed = urlparse(url)
        if parsed.scheme not in ('http', 'https'):  # only HTTP/HTTPS
            return False
        if parsed.netloc != base_domain:  # stay within target domain
            return False
        lowered = url.lower()
        if any(ext in lowered for ext in self._SKIP_EXTENSIONS):
            return False  # skip binary files
        return url not in self.visited_urls  # avoid duplicates

    def sanitize_url(self, url: str) -> str:
        """
        Normalize a URL by dropping the query string, fragment, and any
        trailing slash.

        NOTE(review): dropping the query string conflates URLs that differ
        only by parameters (e.g. paginated listings); intentional here to
        reduce duplicate crawling, but verify against the target site.

        Args:
            url: URL to sanitize.

        Returns:
            str: Normalized "scheme://netloc/path" form.
        """
        parsed = urlparse(url)
        return f"{parsed.scheme}://{parsed.netloc}{parsed.path.rstrip('/')}"

    def clean_text(self, text: str) -> str:
        """
        Clean extracted text: strip boilerplate phrases, collapse runs of
        whitespace within each line, and drop short lines that are unlikely
        to be meaningful content.

        Bug fix: the previous implementation collapsed ALL whitespace
        (including newlines) into single spaces FIRST, which merged the text
        into one line and made the subsequent per-line short-line filter a
        no-op. Line structure is now preserved while cleaning.

        Args:
            text: Raw text (newline-separated, as produced by get_text).

        Returns:
            str: Cleaned text, one meaningful line per line.
        """
        cleaned_lines = []
        for raw_line in text.split('\n'):
            line = self._BOILERPLATE_RE.sub('', raw_line)
            # Collapse whitespace runs left over after boilerplate removal.
            line = re.sub(r'\s+', ' ', line).strip()
            if len(line) > self._MIN_LINE_CHARS:
                cleaned_lines.append(line)
        return '\n'.join(cleaned_lines)

    def extract_main_content(self, soup: "BeautifulSoup") -> str:
        """
        Extract the primary content from a parsed HTML document.

        WARNING: mutates *soup* -- chrome elements (scripts, nav, footers,
        ...) are decomposed in place. Pull anything else you need from the
        soup (e.g. links) BEFORE calling this.

        Args:
            soup: BeautifulSoup-parsed HTML document (modified in place).

        Returns:
            str: Cleaned main-content text.
        """
        # Drop elements that typically hold chrome rather than content.
        for element in soup(['script', 'style', 'nav', 'footer',
                             'header', 'iframe', 'aside', 'form']):
            element.decompose()

        # Prefer semantic containers that usually wrap the main content.
        for selector in ['article', 'main', 'section[role="main"]', '.content']:
            container = soup.select_one(selector)
            if container:
                return self.clean_text(container.get_text(separator='\n'))

        # Fall back to <body>; guard against documents without one
        # (fragments, XML-ish responses) where soup.body is None.
        root = soup.body if soup.body is not None else soup
        return self.clean_text(root.get_text(separator='\n'))

    def _build_document(self, url: str, soup: "BeautifulSoup",
                        response: "requests.Response") -> Optional[Dict]:
        """
        Convert a fetched, parsed page into a structured document.

        Args:
            url: Canonical URL of the page.
            soup: Parsed HTML (will be mutated by content extraction).
            response: HTTP response the page came from (for headers).

        Returns:
            Optional[Dict]: Document dict, or None if the page is too thin.
        """
        # soup.title.string is None when <title> has nested markup; fall
        # back to the URL path in that case as well as when <title> is absent.
        if soup.title and soup.title.string:
            title = soup.title.string.strip()
        else:
            title = urlparse(url).path

        content = self.extract_main_content(soup)
        if len(content.split()) < self._MIN_PAGE_WORDS:
            return None  # insufficient content to be worth indexing

        return {
            'url': url,
            'title': title,
            'content': content,
            'last_modified': response.headers.get('Last-Modified', '')
        }

    def get_page_content(self, url: str) -> Optional[Dict]:
        """
        Fetch and process a single web page.

        Args:
            url: URL to fetch.

        Returns:
            Optional[Dict]: Structured page data, or None if the page is
            unreachable, non-HTML, or has insufficient content.
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=15)
            response.raise_for_status()

            # Skip non-HTML content (images, downloads, APIs, ...).
            if 'text/html' not in response.headers.get('Content-Type', ''):
                return None

            soup = BeautifulSoup(response.text, 'lxml')
            return self._build_document(url, soup, response)

        except Exception as e:
            # Best-effort: a single bad page must not abort the crawl.
            self.logger.warning(f"Error processing {url}: {str(e)}")
            return None

    def extract_links(self, url: str, soup: "BeautifulSoup") -> List[str]:
        """
        Collect crawlable same-domain links from a page.

        Args:
            url: Base URL used both for relative-link resolution and to
                derive the domain the crawl stays within.
            soup: Parsed HTML document.

        Returns:
            List[str]: Sorted, sanitized absolute URLs, capped at
            Config.MAX_LINKS_PER_PAGE.
        """
        base_domain = urlparse(url).netloc
        links = set()

        for anchor in soup.find_all('a', href=True):
            href = anchor['href'].split('#')[0]  # drop fragment
            if not href or href.startswith('javascript:'):
                continue

            candidate = self.sanitize_url(urljoin(url, href))
            if self.is_valid_url(candidate, base_domain):
                links.add(candidate)

        # Sort for determinism, then cap to keep the frontier bounded.
        return sorted(links)[:Config.MAX_LINKS_PER_PAGE]

    async def crawl(self, url: str) -> str:
        """
        Asynchronously fetch a single URL and return its visible text.

        Unlike the sync pipeline, this applies no content heuristics -- it
        only strips <script>/<style> and returns the page's full text.

        Args:
            url: URL to crawl.

        Returns:
            str: Extracted text content.

        Raises:
            Exception: Propagates any fetch/parse failure after logging it.
        """
        try:
            async with aiohttp.ClientSession() as session:
                # Send the same headers as the sync path for consistency.
                async with session.get(url, headers=self.headers) as response:
                    html = await response.text()
                    soup = BeautifulSoup(html, 'html.parser')
                    # Remove script and style elements before text extraction.
                    for element in soup(["script", "style"]):
                        element.decompose()
                    return soup.get_text()
        except Exception as e:
            self.logger.error(f"Crawling error: {str(e)}")
            raise

    def crawl_sync(self, start_url: str, max_pages: Optional[int] = None) -> List[Dict]:
        """
        Synchronously crawl a website using breadth-first search.

        Bug fixes vs. the previous version:
        - Each page is fetched ONCE (it was fetched twice: once for content,
          once again for links).
        - Links are followed even from "thin" pages (<100 words), so hub and
          index pages no longer dead-end the crawl.
        - The BFS frontier is a deque (list.pop(0) was O(n)).

        Args:
            start_url: Initial URL to begin crawling.
            max_pages: Maximum number of documents to collect. Defaults to
                Config.MAX_PAGES_TO_CRAWL, resolved at call time so importing
                this module does not require a fully-populated Config.

        Returns:
            List[Dict]: Structured documents from crawled pages.
        """
        if max_pages is None:
            max_pages = Config.MAX_PAGES_TO_CRAWL

        queue = deque([start_url])  # BFS frontier
        documents: List[Dict] = []  # collected documents

        while queue and len(documents) < max_pages:
            sanitized_url = self.sanitize_url(queue.popleft())

            if sanitized_url in self.visited_urls:
                continue

            self.visited_urls.add(sanitized_url)
            self.logger.info(f"Crawling: {sanitized_url}")

            try:
                response = requests.get(sanitized_url, headers=self.headers, timeout=15)
                response.raise_for_status()
                # Skip non-HTML content.
                if 'text/html' not in response.headers.get('Content-Type', ''):
                    continue
                soup = BeautifulSoup(response.text, 'lxml')
            except Exception as e:
                self.logger.warning(f"Error processing {sanitized_url}: {str(e)}")
                continue

            # Extract links BEFORE _build_document, because content
            # extraction decomposes nav/header/footer (and their links).
            queue.extend(link for link in self.extract_links(sanitized_url, soup)
                         if link not in self.visited_urls)

            page_data = self._build_document(sanitized_url, soup, response)
            if page_data:
                documents.append(page_data)

        return documents