Spaces:

mmcc007
/

lazzloe.com

Sleeping

App Files Files Community

lazzloe.com / web_crawler.py

mmcc007

Upload folder using huggingface_hub

748113b verified over 1 year ago

raw

history blame contribute delete

3.47 kB

	import requests
	from bs4 import BeautifulSoup
	from urllib.parse import urljoin, urlparse
	import json
	import re
	import time
	import sys
	import logging

	# Module-level logger; handlers and log levels are left to the importing application
	logger = logging.getLogger(__name__)

	class WebCrawler:
	    """Depth-limited, single-site crawler that collects headings and paragraphs.

	    Starting from one URL, the crawler follows ``<a href>`` links that stay on
	    the same host as the start URL, up to ``max_depth`` link hops away. For
	    every page fetched it records the URL, the text of all ``h1``-``h6``
	    elements and the text of all ``p`` elements.

	    robots.txt handling is deliberately simple and conservative: every
	    ``Disallow`` rule in the file is honoured regardless of which
	    ``User-agent`` group it belongs to, rules are treated as plain path
	    prefixes (``*`` and ``$`` wildcards are not interpreted), and ``Allow``
	    rules are ignored.
	    """

	    # Captures the path of each ``Disallow`` directive. Anchored to the start
	    # of a line so commented-out rules are skipped; ``[ \t]*`` (not ``\s*``)
	    # keeps an empty ``Disallow:`` from swallowing the following line.
	    _DISALLOW_RE = re.compile(r'^[ \t]*Disallow:[ \t]*(\S+)', re.IGNORECASE | re.MULTILINE)

	    def __init__(self, max_depth=2):
	        """Create a crawler.

	        Args:
	            max_depth: Maximum number of link hops to follow from the start
	                URL (the start URL itself is depth 0 when crawled that way).
	        """
	        self.base_url = None  # Start URL of the current crawl; set by crawl()
	        self.visited = set()  # URLs already attempted, to avoid refetching
	        self.max_depth = max_depth
	        self.data = []  # One dict per successfully parsed page
	        self.session = requests.Session()
	        self.delay = 0.1 # Delay between requests to prevent overwhelming the server
	        self._robots_cache = {}  # origin ("scheme://host") -> list of disallowed path prefixes
	        logger.info(f"WebCrawler initialized with max_depth: {max_depth}")

	    @classmethod
	    def _disallowed_paths(cls, robots_text):
	        """Return the path of every non-empty ``Disallow`` rule in *robots_text*."""
	        # A value starting with '#' means the rule itself was empty and only a
	        # trailing comment followed it.
	        return [path for path in cls._DISALLOW_RE.findall(robots_text) if not path.startswith('#')]

	    @staticmethod
	    def _path_blocked(url, disallowed_paths):
	        """Return True if the path (plus query) of *url* starts with any disallowed prefix."""
	        parts = urlparse(url)
	        target = parts.path or '/'
	        if parts.query:
	            target = f"{target}?{parts.query}"
	        return any(target.startswith(prefix) for prefix in disallowed_paths)

	    @staticmethod
	    def _same_site(url, base_url):
	        """Return True if *url* is an http(s) URL on the same host as *base_url*."""
	        link_parts = urlparse(url)
	        base_parts = urlparse(base_url)
	        # Compare parsed hosts rather than string prefixes so that
	        # "https://a.com.evil.org" is not mistaken for "https://a.com".
	        return (
	            link_parts.scheme in ('http', 'https')
	            and link_parts.netloc.lower() == base_parts.netloc.lower()
	        )

	    def can_crawl(self, url):
	        """Return False if the site's robots.txt disallows *url*, True otherwise.

	        robots.txt is fetched at most once per origin and the parsed rules are
	        cached on the instance. If the file cannot be fetched or does not
	        return HTTP 200, crawling is allowed.
	        """
	        logger.debug(f"Checking if can crawl: {url}")
	        parsed_url = urlparse(url)
	        origin = f"{parsed_url.scheme}://{parsed_url.netloc}"

	        if origin not in self._robots_cache:
	            robots_url = urljoin(origin, '/robots.txt')
	            rules = []
	            try:
	                response = self.session.get(robots_url, timeout=10)
	            except requests.RequestException:
	                logger.warning(f"Error fetching robots.txt for {url}", exc_info=True)
	            else:
	                if response.status_code == 200:
	                    rules = self._disallowed_paths(response.text)
	            self._robots_cache[origin] = rules

	        if self._path_blocked(url, self._robots_cache[origin]):
	            logger.info(f"Crawling not allowed for: {url}")
	            return False

	        logger.debug(f"Crawling allowed for: {url}")
	        return True

	    def fetch(self, url):
	        """GET *url* and return the response body as text, or None on any request error."""
	        logger.info(f"Fetching content from: {url}")
	        try:
	            response = self.session.get(url, timeout=10)
	            response.raise_for_status()
	        except requests.RequestException as e:
	            logger.error(f"Error fetching {url}: {e}", exc_info=True)
	            return None
	        logger.debug(f"Successfully fetched content from: {url}")
	        return response.text

	    def parse(self, html_content, url):
	        """Record the page's headings and paragraphs and return its outgoing links.

	        Appends a ``{'url', 'headings', 'paragraphs'}`` dict to ``self.data``.

	        Args:
	            html_content: HTML source of the page.
	            url: URL the page was fetched from; used to resolve relative links.

	        Returns:
	            A list of absolute URLs, one per ``<a>`` element that has an ``href``.
	        """
	        logger.info(f"Parsing HTML content from: {url}")
	        soup = BeautifulSoup(html_content, 'html.parser')
	        page_data = {
	            'url': url,
	            'headings': [heading.get_text(strip=True) for heading in soup.find_all(re.compile('^h[1-6]$'))],
	            'paragraphs': [p.get_text(strip=True) for p in soup.find_all('p')],
	        }
	        self.data.append(page_data)
	        links = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]
	        logger.debug(f"Parsed {len(links)} links from {url}")
	        return links

	    def crawl(self, url, depth):
	        """Crawl *url* and the same-host pages reachable from it.

	        Args:
	            url: Start URL. Its host defines which links are followed.
	            depth: Depth assigned to *url*; pages deeper than ``max_depth``
	                are not fetched. Pass 0 for a normal crawl.

	        Returns:
	            The list of page records collected so far (``self.data``). The list
	            is returned even when *url* itself is skipped (already visited,
	            too deep, or disallowed by robots.txt).
	        """
	        # Pin the crawl root once per top-level call. Recursion goes through
	        # _crawl_page, so the root no longer drifts to whichever page was
	        # visited last.
	        self.base_url = url
	        self._crawl_page(url, depth)
	        return self.get_data()

	    def _crawl_page(self, url, depth):
	        """Fetch and parse one page, then recurse into its same-host links."""
	        if depth > self.max_depth or url in self.visited or not self.can_crawl(url):
	            return

	        logger.info(f"Crawling: {url} at depth {depth}")
	        self.visited.add(url)
	        html_content = self.fetch(url)
	        if not html_content:
	            return

	        links = self.parse(html_content, url)
	        if depth >= self.max_depth:
	            # Every child would be rejected by the depth guard; skip the loop
	            # so we do not sleep once per link for nothing.
	            return

	        for link in links:
	            # "#fragment" variants point at the same document.
	            link = link.split('#', 1)[0]
	            if link in self.visited or not self._same_site(link, self.base_url):
	                continue
	            time.sleep(self.delay) # Respectful crawling
	            self._crawl_page(link, depth + 1)

	    def get_data(self):
	        """Return the list of page records collected so far."""
	        logger.info(f"Returning crawled data: {len(self.data)} pages")
	        return self.data

	if __name__ == "__main__":
	    # Command-line entry point: crawl the URL given as the only argument and
	    # print the collected page records to stdout as JSON.
	    if len(sys.argv) != 2:
	        print("Usage: python web_crawler.py <URL>")
	        sys.exit(1)

	    base_url = sys.argv[1]
	    crawler = WebCrawler(max_depth=2)
	    crawler.crawl(base_url, 0)
	    # Read the results from get_data() rather than from crawl()'s return
	    # value so the output is always a JSON list, even when the start URL
	    # itself was skipped.
	    data = crawler.get_data()
	    print(json.dumps(data, indent=4))