#!/usr/bin/env python3
import asyncio
import json
import logging
import re
from pathlib import Path
from typing import Dict, Optional, Set

import aiohttp
from bs4 import BeautifulSoup
from yarl import URL

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
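

# AsyncCrawler does a breadth-first crawl of a single documentation section:
# URLs come off an asyncio.Queue, pages are fetched concurrently by worker tasks
# (bounded by a semaphore), and each fetched page is saved to disk, mined for
# further in-section links, and recorded in a sitemap.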
class AsyncCrawler:
    def __init__(self, start_url: str, max_concurrent: int = 100):
        self.start_url = URL(start_url)
        self.base_domain = self.start_url.host
        self.base_path = str(self.start_url).split(self.base_domain)[1]
        self.visited_urls: Set[str] = set()
        self.url_queue: asyncio.Queue = asyncio.Queue()
        self.semaphore = asyncio.Semaphore(max_concurrent)
        self.session: Optional[aiohttp.ClientSession] = None
        self.data_dir = Path("data/scraped")
        self.sitemap: Dict[str, list] = {}

    async def init_session(self):
        """Initialize the aiohttp session with a timeout, pooled connections and a custom User-Agent."""
        timeout = aiohttp.ClientTimeout(total=10)
        connector = aiohttp.TCPConnector(limit=100, ttl_dns_cache=300)
        self.session = aiohttp.ClientSession(
            timeout=timeout,
            connector=connector,
            headers={"User-Agent": "ShopBot/1.0"},
        )

    def is_valid_url(self, url: URL) -> bool:
        """Check if URL should be crawled."""
        return (
            str(url).startswith(str(self.start_url))
            and url.scheme in ("http", "https")
            and not url.fragment
        )
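
    # Illustrative examples of the filter above, assuming the default start URL
    # https://shopify.dev/docs/apps/build/flow:
    #   https://shopify.dev/docs/apps/build/flow/triggers      -> crawled (same section)
    #   https://shopify.dev/docs/api                           -> skipped (outside the section)
    #   https://shopify.dev/docs/apps/build/flow#requirements  -> skipped (has a fragment)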

    async def process_page(self, url: str, html: str) -> Set[str]:
        """Extract links, save the raw page to disk, and record it in the sitemap."""
        # Markdown-style links: [text](absolute, root-relative, or relative target)
        pattern = r'\[.*?\]\((https?://[^\)]+|/[^)]+|[^\)]+)\)'
        markdown_links = re.findall(pattern, html)
        # HTML anchor links
        soup = BeautifulSoup(html, 'html.parser')
        anchor_links = [a['href'] for a in soup.find_all('a', href=True)]
        links = markdown_links + anchor_links
        # Resolve relative links against the start URL
        absolute_links = [
            str(URL(link)) if URL(link).host else str(self.start_url.join(URL(link)))
            for link in links
        ]
        # Keep only URLs within the section being crawled
        valid_links = {
            link for link in absolute_links
            if self.is_valid_url(URL(link))
        }
        # Save the raw page under a flattened file name derived from the URL path,
        # e.g. (illustrative) /docs/apps/build/flow/triggers.txt -> flow_triggers.txt
        path = url.split(self.base_domain)[1]
        raw_filepath = self.data_dir / 'raw' / path.replace("/", "_").replace("_docs_apps_build_", "")
        raw_filepath.parent.mkdir(parents=True, exist_ok=True)
        raw_filepath.write_text(html)
        # Record the page and its outgoing in-section links in the sitemap
        self.sitemap[url] = list(valid_links)
        return valid_links

    async def fetch_page(self, url: str) -> None:
        """Fetch and process a single page."""
        if url in self.visited_urls:
            return
        self.visited_urls.add(url)
        try:
            async with self.semaphore:
                async with self.session.get(url) as response:
                    if response.status == 200:
                        html = await response.text()
                        new_urls = await self.process_page(url, html)
                        for new_url in new_urls:
                            if new_url not in self.visited_urls:
                                await self.url_queue.put(new_url)
                        logger.info(f"Successfully processed: {url}")
                    else:
                        logger.warning(f"Failed to fetch {url}: {response.status}")
        except Exception as e:
            logger.error(f"Error processing {url}: {e}")

    def strip_all_html_tags_from_markdown(self, markdown: str) -> str:
        """Strip HTML scaffolding from the raw markdown and convert script-tag code blocks into fenced code blocks."""
        # Layout tags and raw-content markers to remove outright
        patterns = [
            r'<div class="react-code-block" data-preset="file">\n',
            r'<div class="react-code-block" data-preset="basic">\n',
            r'<div class="react-code-block" data-preset="terminal">\n',
            r'<div class="react-code-block-preload ThemeMode-dim">\n',
            r'<div class="react-code-block-preload-bar "></div>\n',
            r'<div class="react-code-block-preload-bar basic-codeblock">',
            r'<div class="react-code-block-preload-placeholder-container">\n',
            r'<div class="react-code-block-preload-code-container">\n',
            r'<div class="react-code-block-preload-codeline-number"></div>\n',
            r'<div class="react-code-block-preload-codeline"></div>\n',
            r'<script data-option=[^>]+ data-value=[^>]+></script>\n',
            r'<div>\n',
            r'</div>\n',
            r'<br>\n',
            r'<p>\n',
            r'</p>\n',
            # Catch-all removal of any non-script tag (disabled):
            # r'<(?!script\b)[^>]+>',
            # r'</(?!script\b)[^>]+>',
            r'END_RAW_MD_CONTENT',
            r'RAW_MD_CONTENT',
        ]
        for pattern in patterns:
            markdown = re.sub(pattern, '', markdown)
        # Convert <script type="text/plain" ... language="..."> blocks into fenced code blocks
        markdown = re.sub(r'<script type="text/plain"[^>]+language="([^"]+)"[^>]*>', r'```\1', markdown)
        markdown = re.sub(r'</script>', '```', markdown)
        # Collapse runs of three or more newlines into two
        markdown = re.sub(r'\n{3,}', '\n\n', markdown)
        return markdown
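
    # For illustration, a code block embedded in the raw markdown such as
    #   <script type="text/plain" language="liquid"> ... </script>
    # (attribute layout assumed) comes out of the substitutions above as
    #   ```liquid ... ```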

    def clean_raw_markdown(self):
        """Clean raw markdown files by stripping HTML tags."""
        raw_dir = self.data_dir / 'raw'
        for raw_file in raw_dir.glob('*.txt'):
            content = raw_file.read_text()
            cleaned_content = self.strip_all_html_tags_from_markdown(content)
            clean_filepath = self.data_dir / 'clean' / raw_file.name
            clean_filepath.parent.mkdir(parents=True, exist_ok=True)
            clean_filepath.write_text(cleaned_content)

    async def run(self):
        """Main crawler loop: drain the queue, spawn workers, and wait until everything is done."""
        self.data_dir.mkdir(parents=True, exist_ok=True)
        await self.init_session()
        await self.url_queue.put(str(self.start_url))
        try:
            workers = []
            while True:
                # Finished once the queue is empty and no workers are in flight
                if self.url_queue.empty() and not workers:
                    break
                while not self.url_queue.empty():
                    # Append '.txt' so the docs page is fetched in its raw markdown form
                    # (the cleaning step below expects that format)
                    url = await self.url_queue.get() + '.txt'
                    if url not in self.visited_urls:
                        worker = asyncio.create_task(self.fetch_page(url))
                        workers.append(worker)
                if workers:
                    done, pending = await asyncio.wait(
                        workers,
                        return_when=asyncio.FIRST_COMPLETED
                    )
                    workers = list(pending)
                    for task in done:
                        await task
        finally:
            # Save the sitemap, clean the raw files, and close the HTTP session
            sitemap_path = self.data_dir / "_sitemap.json"
            sitemap_path.write_text(json.dumps(self.sitemap, indent=2))
            self.clean_raw_markdown()
            await self.session.close()
            logger.info(f"Crawl completed. Processed {len(self.visited_urls)} pages.")


async def main():
    start_url = "https://shopify.dev/docs/apps/build/flow"
    crawler = AsyncCrawler(start_url)
    await crawler.run()


if __name__ == "__main__":
    asyncio.run(main())
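
# Running the script (e.g. `python crawler.py`; the file name is whatever this
# module is saved as) writes fetched pages to data/scraped/raw/, cleaned
# markdown to data/scraped/clean/, and the crawl sitemap to data/scraped/_sitemap.json.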