import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import json
import re
import time
import sys
import logging
# Configure logging
logger = logging.getLogger(__name__)
class WebCrawler:
    """Recursive same-site crawler that honors robots.txt Disallow rules.

    Starting from a seed URL, follows links up to ``max_depth`` levels deep
    while staying under the start URL, collecting each page's headings and
    paragraph text into ``self.data``.

    NOTE(review): the robots.txt handling is a naive line-regex parser — it
    ignores User-agent sections and Allow rules. The stdlib
    ``urllib.robotparser.RobotFileParser`` would be the proper replacement.
    """

    def __init__(self, max_depth=2):
        self.base_url = None                 # start URL; fixed on the first crawl() call
        self.visited = set()                 # URLs already crawled (loop protection)
        self.max_depth = max_depth           # maximum recursion depth
        self.data = []                       # one dict per parsed page
        self.session = requests.Session()    # reuse connections across requests
        self.delay = 0.1                     # seconds between requests (politeness)
        self._robots_cache = {}              # netloc -> list of Disallow path prefixes
        logger.info(f"WebCrawler initialized with max_depth: {max_depth}")

    def _disallowed_paths(self, scheme, netloc):
        """Return the cached Disallow entries from *netloc*'s robots.txt.

        Fetches robots.txt at most once per host (the original re-fetched it
        for every URL). On any fetch error the host is treated as allowing
        everything, matching the original's best-effort behavior.
        """
        if netloc not in self._robots_cache:
            robots_url = f"{scheme}://{netloc}/robots.txt"
            paths = []
            try:
                response = self.session.get(robots_url, timeout=10)
                if response.status_code == 200:
                    paths = [p.strip() for p in re.findall(r'Disallow: (.+)', response.text)]
            except requests.RequestException:
                logger.warning(f"Error fetching robots.txt from {robots_url}", exc_info=True)
            self._robots_cache[netloc] = paths
        return self._robots_cache[netloc]

    def can_crawl(self, url):
        """Return True unless a robots.txt Disallow rule covers *url*."""
        logger.debug(f"Checking if can crawl: {url}")
        parsed_url = urlparse(url)
        origin = f"{parsed_url.scheme}://{parsed_url.netloc}"
        for path in self._disallowed_paths(parsed_url.scheme, parsed_url.netloc):
            # BUG FIX: the original joined Disallow paths against
            # self.base_url, which is still None on the very first call and
            # raised TypeError. Join against the URL's own origin instead.
            if url.startswith(urljoin(origin, path)):
                logger.info(f"Crawling not allowed for: {url}")
                return False
        logger.debug(f"Crawling allowed for: {url}")
        return True

    def fetch(self, url):
        """GET *url* and return the response body, or None on any request error."""
        logger.info(f"Fetching content from: {url}")
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            logger.debug(f"Successfully fetched content from: {url}")
            return response.text
        except requests.RequestException as e:
            logger.error(f"Error fetching {url}: {e}", exc_info=True)
            return None

    def parse(self, html_content, url):
        """Extract headings/paragraphs into self.data; return the page's links.

        Links are resolved to absolute URLs relative to *url*.
        """
        logger.info(f"Parsing HTML content from: {url}")
        soup = BeautifulSoup(html_content, 'html.parser')
        page_data = {
            'url': url,
            'headings': [heading.get_text(strip=True) for heading in soup.find_all(re.compile('^h[1-6]$'))],
            'paragraphs': [p.get_text(strip=True) for p in soup.find_all('p')],
        }
        self.data.append(page_data)
        links = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]
        logger.debug(f"Parsed {len(links)} links from {url}")
        return links

    def crawl(self, url, depth):
        """Crawl *url* recursively up to self.max_depth; return all page data.

        Always returns the accumulated data list (the original returned None
        on the early-exit paths, which made the CLI print JSON "null").
        """
        if depth > self.max_depth or url in self.visited or not self.can_crawl(url):
            return self.get_data()
        logger.info(f"Crawling: {url} at depth {depth}")
        # BUG FIX: the original reassigned self.base_url on every recursive
        # call, so the same-site check below compared against the *current*
        # page instead of the seed URL. Set it once and keep it.
        if self.base_url is None:
            self.base_url = url
        self.visited.add(url)
        html_content = self.fetch(url)
        if html_content:
            links = self.parse(html_content, url)
            for link in links:
                if link.startswith(self.base_url):  # Stay within the same site
                    time.sleep(self.delay)  # Respectful crawling
                    self.crawl(link, depth + 1)
        return self.get_data()

    def get_data(self):
        """Return the list of per-page data dicts collected so far."""
        logger.info(f"Returning crawled data: {len(self.data)} pages")
        return self.data
if __name__ == "__main__":
    # CLI entry point: crawl the URL given on the command line and dump the
    # collected page data as pretty-printed JSON.
    if len(sys.argv) != 2:
        print("Usage: python web_crawler.py <URL>")
        sys.exit(1)
    base_url = sys.argv[1]
    crawler = WebCrawler(max_depth=2)
    data = crawler.crawl(base_url, 0)
    # crawl() can return None on an early exit (e.g. robots.txt disallows the
    # seed URL); emit an empty list instead of JSON "null" in that case.
    print(json.dumps(data or [], indent=4))