File size: 3,472 Bytes
748113b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import json
import re
import time
import sys
import logging

# Configure logging
logger = logging.getLogger(__name__)

class WebCrawler:
    """Depth-limited recursive crawler that collects headings and paragraphs
    from pages on the starting URL's host, honouring robots.txt Disallow rules.

    Usage: ``WebCrawler(max_depth=2).crawl(start_url, 0)`` returns a list of
    per-page dicts with keys ``url``, ``headings`` and ``paragraphs``.
    """

    def __init__(self, max_depth=2):
        """Initialize crawler state.

        Args:
            max_depth: Maximum recursion depth; pages at depth > max_depth
                are not fetched.
        """
        self.base_url = None        # anchor URL; set exactly once in crawl()
        self.visited = set()        # URLs already crawled (cycle guard)
        self.max_depth = max_depth
        self.data = []              # accumulated per-page dicts
        self.session = requests.Session()
        self.delay = 0.1            # seconds between requests (politeness)
        self._robots_cache = {}     # netloc -> list of Disallow path prefixes
        logger.info(f"WebCrawler initialized with max_depth: {max_depth}")

    def _disallowed_paths(self, scheme, netloc):
        """Fetch, parse and cache robots.txt Disallow rules for one host.

        Best-effort: a fetch error is logged and treated as "no rules".
        NOTE(review): this is a simplified parser (no User-agent sections);
        the stdlib ``urllib.robotparser`` would be the full implementation.
        """
        if netloc not in self._robots_cache:
            robots_url = f"{scheme}://{netloc}/robots.txt"
            paths = []
            try:
                response = self.session.get(robots_url, timeout=10)
                if response.status_code == 200:
                    paths = [p.strip() for p in
                             re.findall(r'Disallow: (.+)', response.text)]
            except requests.RequestException:
                logger.warning(f"Error fetching robots.txt from {robots_url}",
                               exc_info=True)
            self._robots_cache[netloc] = paths
        return self._robots_cache[netloc]

    def can_crawl(self, url):
        """Return True unless a cached robots.txt rule disallows *url*.

        Fix: the original compared against ``urljoin(self.base_url, path)``
        even though base_url was still None on the first call, silently
        breaking the match; we now compare the URL's own path against the
        Disallow prefixes, independent of crawl state.
        """
        logger.debug(f"Checking if can crawl: {url}")
        parsed_url = urlparse(url)
        for path in self._disallowed_paths(parsed_url.scheme, parsed_url.netloc):
            if path and parsed_url.path.startswith(path):
                logger.info(f"Crawling not allowed for: {url}")
                return False
        logger.debug(f"Crawling allowed for: {url}")
        return True

    def fetch(self, url):
        """GET *url* and return the body text, or None on any request error."""
        logger.info(f"Fetching content from: {url}")
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            logger.debug(f"Successfully fetched content from: {url}")
            return response.text
        except requests.RequestException as e:
            logger.error(f"Error fetching {url}: {e}", exc_info=True)
            return None

    def parse(self, html_content, url):
        """Extract headings/paragraphs into self.data; return outgoing links.

        Args:
            html_content: Raw HTML of the page.
            url: Page URL, used to absolutize relative hrefs.

        Returns:
            List of absolute link URLs found on the page.
        """
        logger.info(f"Parsing HTML content from: {url}")
        soup = BeautifulSoup(html_content, 'html.parser')
        page_data = {
            'url': url,
            'headings': [heading.get_text(strip=True)
                         for heading in soup.find_all(re.compile('^h[1-6]$'))],
            'paragraphs': [p.get_text(strip=True) for p in soup.find_all('p')],
        }
        self.data.append(page_data)
        links = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]
        logger.debug(f"Parsed {len(links)} links from {url}")
        return links

    def crawl(self, url, depth):
        """Recursively crawl *url* up to max_depth; return all collected data.

        Fixes over the original:
        - base_url is set once from the FIRST url, not reassigned on every
          recursive call (which made the same-domain check compare links
          against the current page instead of the start URL).
        - Same-domain restriction compares parsed netloc, not a raw string
          prefix (prefix matching accepts look-alike hosts).
        - Always returns the collected data list, never None, so the caller's
          json.dumps() gets a list even when the start URL is skipped.
        """
        if self.base_url is None:
            self.base_url = url  # anchor the domain restriction once
        if depth > self.max_depth or url in self.visited or not self.can_crawl(url):
            return self.get_data()

        logger.info(f"Crawling: {url} at depth {depth}")
        self.visited.add(url)
        html_content = self.fetch(url)
        if html_content:
            base_netloc = urlparse(self.base_url).netloc
            links = self.parse(html_content, url)
            for link in links:
                # Stay on the starting host; exact netloc comparison instead
                # of the spoofable startswith() prefix test.
                if urlparse(link).netloc == base_netloc:
                    time.sleep(self.delay)  # respectful crawling
                    self.crawl(link, depth + 1)
        return self.get_data()

    def get_data(self):
        """Return the list of per-page dicts collected so far."""
        logger.info(f"Returning crawled data: {len(self.data)} pages")
        return self.data

if __name__ == "__main__":
    # Expect exactly one positional argument: the starting URL.
    args = sys.argv[1:]
    if len(args) != 1:
        print("Usage: python web_crawler.py <URL>")
        sys.exit(1)

    start_url = args[0]
    crawler = WebCrawler(max_depth=2)
    crawled = crawler.crawl(start_url, 0)
    print(json.dumps(crawled, indent=4))