Spaces:
Sleeping
Sleeping
| import requests | |
| from bs4 import BeautifulSoup | |
| from urllib.parse import urljoin, urlparse | |
| import json | |
| import re | |
| import time | |
| import sys | |
| import logging | |
| # Module-level logger; no handlers or levels are configured here | |
| logger = logging.getLogger(__name__) | |
class WebCrawler:
    """Depth-limited crawler that stays on the host of the start URL.

    Every successfully fetched page contributes one dict to ``self.data``
    with the keys ``url``, ``headings`` and ``paragraphs``.

    Attributes:
        base_url: Start URL of the current crawl; set by ``crawl``.
        visited: URLs for which a crawl has already been attempted.
        max_depth: Deepest link level followed (the start URL is depth 0).
        data: Accumulated per-page dicts.
        session: ``requests.Session`` shared by all HTTP requests.
        delay: Seconds slept before following each link.
    """

    # Compiled once; matches the tag names h1 .. h6.
    _HEADING_TAG = re.compile('^h[1-6]$')

    def __init__(self, max_depth=2):
        self.base_url = None
        self.visited = set()
        self.max_depth = max_depth
        self.data = []
        self.session = requests.Session()
        self.delay = 0.1  # Delay between requests to prevent overwhelming the server
        # origin ("scheme://host") -> parsed robots.txt, or None when unavailable
        self._robots_cache = {}
        logger.info(f"WebCrawler initialized with max_depth: {max_depth}")

    def can_crawl(self, url):
        """Return True unless robots.txt on ``url``'s host disallows it.

        The rules are looked up for the origin of ``url`` itself, so the
        check also works before ``base_url`` has been set.  A robots.txt
        that is missing, unreachable or not served with status 200 is
        treated as "everything allowed" (best effort).
        """
        logger.debug(f"Checking if can crawl: {url}")
        rules = self._robots_for(url)
        if rules is not None:
            agent = str(self.session.headers.get('User-Agent') or '*')
            if not rules.can_fetch(agent, url):
                logger.info(f"Crawling not allowed for: {url}")
                return False
        logger.debug(f"Crawling allowed for: {url}")
        return True

    def _robots_for(self, url):
        """Return the cached robots.txt parser for ``url``'s origin, or None."""
        from urllib.robotparser import RobotFileParser

        parts = urlparse(url)
        origin = f"{parts.scheme}://{parts.netloc}"
        if origin in self._robots_cache:
            return self._robots_cache[origin]

        rules = None
        try:
            response = self.session.get(urljoin(origin, '/robots.txt'), timeout=10)
            if response.status_code == 200:
                rules = RobotFileParser()
                rules.parse(response.text.splitlines())
        except requests.RequestException:
            logger.warning(f"Error fetching robots.txt for {url}", exc_info=True)

        # Cache failures too, so robots.txt is requested once per origin
        # instead of once per crawled URL.
        self._robots_cache[origin] = rules
        return rules

    def fetch(self, url):
        """Return the response body of ``url`` as text, or None on any request error."""
        logger.info(f"Fetching content from: {url}")
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            logger.error(f"Error fetching {url}: {e}", exc_info=True)
            return None
        logger.debug(f"Successfully fetched content from: {url}")
        return response.text

    def parse(self, html_content, url):
        """Record the page's headings and paragraphs and return its links.

        Appends one dict to ``self.data`` and returns every ``<a href>``
        target resolved against ``url``.  An href that cannot be resolved
        is skipped instead of aborting the crawl.
        """
        logger.info(f"Parsing HTML content from: {url}")
        soup = BeautifulSoup(html_content, 'html.parser')
        page_data = {
            'url': url,
            'headings': [heading.get_text(strip=True) for heading in soup.find_all(self._HEADING_TAG)],
            'paragraphs': [p.get_text(strip=True) for p in soup.find_all('p')],
        }
        self.data.append(page_data)

        links = []
        for anchor in soup.find_all('a', href=True):
            try:
                links.append(urljoin(url, anchor['href']))
            except ValueError:
                # urllib.parse raises ValueError for e.g. unbalanced "[" in the host.
                logger.debug(f"Skipping malformed href on {url}: {anchor['href']!r}")
        logger.debug(f"Parsed {len(links)} links from {url}")
        return links

    def crawl(self, url, depth):
        """Crawl ``url`` and the same-host pages it links to.

        Args:
            url: Page to start from.
            depth: Depth assigned to ``url``; pass 0 for a new crawl.

        Returns:
            The list of page dicts collected so far.  A list is returned
            even when ``url`` itself is skipped (already visited, too deep,
            or disallowed by robots.txt).
        """
        # Pin the crawl root once; it must not follow the recursion, or the
        # same-host filter would compare against whichever page came last.
        if depth == 0 or self.base_url is None:
            self.base_url = url
        self._crawl(url, depth)
        return self.get_data()

    def _crawl(self, url, depth):
        """Recursive worker behind ``crawl``; results accumulate in ``self.data``."""
        if depth > self.max_depth or url in self.visited or not self.can_crawl(url):
            return
        logger.info(f"Crawling: {url} at depth {depth}")
        self.visited.add(url)

        html_content = self.fetch(url)
        if not html_content:
            return
        links = self.parse(html_content, url)
        if depth >= self.max_depth:
            return  # children would exceed max_depth; nothing left to follow

        for link in links:
            # "page#a" and "page#b" are the same document.
            link = urlparse(link)._replace(fragment='').geturl()
            if link in self.visited or not self._same_site(link):
                continue
            time.sleep(self.delay)  # Respectful crawling
            self._crawl(link, depth + 1)

    def _same_site(self, link):
        """Return True when ``link`` is on the same host as ``base_url``."""
        host = urlparse(link).netloc.lower()
        return bool(host) and host == urlparse(self.base_url).netloc.lower()

    def get_data(self):
        """Return the list of page dicts collected so far."""
        logger.info(f"Returning crawled data: {len(self.data)} pages")
        return self.data
if __name__ == "__main__":
    # Expect exactly one argument: the URL at which the crawl starts.
    if len(sys.argv) != 2:
        print("Usage: python web_crawler.py <URL>", file=sys.stderr)
        sys.exit(1)

    crawler = WebCrawler(max_depth=2)
    crawler.crawl(sys.argv[1], 0)
    # Read the results through get_data() so the output is always a JSON
    # list, whatever crawl() itself returned for the start URL.
    print(json.dumps(crawler.get_data(), indent=4))